This patch adds support for the __arm_locally_streaming attribute, which allows a function to use SME internally without changing the function's ABI. The function's body is compiled as streaming code; if PSTATE.SM is not guaranteed to be 1 on entry, the prologue switches into streaming mode and the epilogue restores the incoming state. The attribute is valid but redundant for __arm_streaming functions, which are already entered in streaming mode.
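For example, a minimal sketch of the attribute in use (scale and scale_streaming are illustrative names, not part of the patch; an SME-enabled target such as -march=armv9-a+sme is assumed):

    /* A callee with a streaming ABI: it must be entered with PSTATE.SM==1.  */
    void scale_streaming (float *dst, const float *src, int n) __arm_streaming;

    /* scale keeps the normal non-streaming ABI, so existing callers are
       unaffected, but its body runs in streaming mode: the compiler emits
       SMSTART SM in the prologue and SMSTOP SM in the epilogue, and the
       call below therefore needs no mode switch of its own.  */
    __arm_locally_streaming void
    scale (float *dst, const float *src, int n)
    {
      scale_streaming (dst, src, n);
    }

The testcases below check the generated prologue and epilogue sequences for the various combinations of locally-streaming with streaming, streaming-compatible, and ZA state.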
gcc/ * config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add arm::locally_streaming. (aarch64_fndecl_is_locally_streaming): New function. (aarch64_fndecl_sm_state): Handle locally-streaming functions. (aarch64_cfun_enables_pstate_sm): New function. (aarch64_add_offset): Add an argument that specifies whether the streaming vector length should be used instead of the prevailing one. (aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise. (aarch64_allocate_and_probe_stack_space): Likewise. (aarch64_expand_mov_immediate): Update calls accordingly. (aarch64_need_old_pstate_sm): Return true for locally-streaming streaming-compatible functions. (aarch64_layout_frame): Force all call-preserved Z and P registers to be saved and restored if the function switches PSTATE.SM in the prologue. (aarch64_get_separate_components): Disable shrink-wrapping of such Z and P saves and restores. (aarch64_use_late_prologue_epilogue): New function. (aarch64_expand_prologue): Measure SVE lengths in the streaming vector length for locally-streaming functions, then emit code to enable streaming mode. (aarch64_expand_epilogue): Likewise in reverse. (TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define. * config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros): Define __arm_locally_streaming. gcc/testsuite/ * gcc.target/aarch64/sme/locally_streaming_1.c: New test. * gcc.target/aarch64/sme/locally_streaming_2.c: Likewise. * gcc.target/aarch64/sme/locally_streaming_3.c: Likewise. * gcc.target/aarch64/sme/locally_streaming_4.c: Likewise. * gcc.target/aarch64/sme/keyword_macros_1.c: Add __arm_locally_streaming. * g++.target/aarch64/sme/keyword_macros_1.C: Likewise. --- gcc/config/aarch64/aarch64-c.cc | 1 + gcc/config/aarch64/aarch64.cc | 233 +++++++-- .../g++.target/aarch64/sme/keyword_macros_1.C | 1 + .../gcc.target/aarch64/sme/keyword_macros_1.c | 1 + .../aarch64/sme/locally_streaming_1.c | 466 ++++++++++++++++++ .../aarch64/sme/locally_streaming_2.c | 177 +++++++ .../aarch64/sme/locally_streaming_3.c | 273 ++++++++++ .../aarch64/sme/locally_streaming_4.c | 145 ++++++ 8 files changed, 1259 insertions(+), 38 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index f2fa5df1b82..2a8ca46987a 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile) DEFINE_ARM_KEYWORD_MACRO ("streaming"); DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible"); + DEFINE_ARM_KEYWORD_MACRO ("locally_streaming"); #undef DEFINE_ARM_KEYWORD_MACRO diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 12753ac133e..6ad29a3a84f 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3136,6 +3136,7 @@ static const attribute_spec aarch64_arm_attributes[] = NULL, attr_streaming_exclusions }, { "streaming_compatible", 0, 0, false, true, true, true, NULL, attr_streaming_exclusions }, + { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL }, { "new", 1, -1, true, false, false, false, handle_arm_new, NULL }, { "preserves", 1, -1, false, true, true, true, @@ -4445,6 +4446,16 @@ aarch64_fntype_isa_mode (const_tree fntype) | 
aarch64_fntype_pstate_za (fntype)); } +/* Return true if FNDECL uses streaming mode internally, as an + implementation choice. */ + +static bool +aarch64_fndecl_is_locally_streaming (const_tree fndecl) +{ + return lookup_attribute ("arm", "locally_streaming", + DECL_ATTRIBUTES (fndecl)); +} + /* Return the state of PSTATE.SM when compiling the body of function FNDECL. This might be different from the state of PSTATE.SM on entry. */ @@ -4452,6 +4463,9 @@ aarch64_fntype_isa_mode (const_tree fntype) static aarch64_feature_flags aarch64_fndecl_pstate_sm (const_tree fndecl) { + if (aarch64_fndecl_is_locally_streaming (fndecl)) + return AARCH64_FL_SM_ON; + return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl)); } @@ -4527,6 +4541,16 @@ aarch64_cfun_has_new_state (const char *state_name) return aarch64_fndecl_has_new_state (cfun->decl, state_name); } +/* Return true if PSTATE.SM is 1 in the body of the current function, + but is not guaranteed to be 1 on entry. */ + +static bool +aarch64_cfun_enables_pstate_sm () +{ + return (aarch64_fndecl_is_locally_streaming (cfun->decl) + && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON); +} + /* Return true if the current function has state STATE_NAME, either by creating new state itself or by sharing state with callers. */ @@ -6768,6 +6792,10 @@ aarch64_add_offset_temporaries (rtx x) TEMP2, if nonnull, is a second temporary register that doesn't overlap either DEST or REG. + FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET + is measured relative to the SME vector length instead of the current + prevailing vector length. It is 0 otherwise. + Since this function may be used to adjust the stack pointer, we must ensure that it cannot cause transient stack deallocation (for example by first incrementing SP and then decrementing when adjusting by a @@ -6776,6 +6804,7 @@ aarch64_add_offset_temporaries (rtx x) static void aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, poly_int64 offset, rtx temp1, rtx temp2, + aarch64_feature_flags force_isa_mode, bool frame_related_p, bool emit_move_imm = true) { gcc_assert (emit_move_imm || temp1 != NULL_RTX); @@ -6788,9 +6817,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, /* Try using ADDVL or ADDPL to add the whole value. 
*/ if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset)) { - rtx offset_rtx = gen_int_mode (offset, mode); + gcc_assert (offset.coeffs[0] == offset.coeffs[1]); + rtx offset_rtx; + if (force_isa_mode == 0) + offset_rtx = gen_int_mode (offset, mode); + else + offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0); rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); RTX_FRAME_RELATED_P (insn) = frame_related_p; + if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON)) + add_reg_note (insn, REG_CFA_ADJUST_CFA, + gen_rtx_SET (dest, plus_constant (Pmode, src, + offset))); return; } @@ -6806,11 +6844,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (poly_offset)) { - rtx offset_rtx = gen_int_mode (poly_offset, mode); + rtx offset_rtx; + if (force_isa_mode == 0) + offset_rtx = gen_int_mode (poly_offset, mode); + else + offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0); if (frame_related_p) { rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); RTX_FRAME_RELATED_P (insn) = true; + if (force_isa_mode & AARCH64_FL_SM_ON) + add_reg_note (insn, REG_CFA_ADJUST_CFA, + gen_rtx_SET (dest, plus_constant (Pmode, src, + poly_offset))); src = dest; } else @@ -6841,9 +6887,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, rtx val; if (IN_RANGE (rel_factor, -32, 31)) { + if (force_isa_mode & AARCH64_FL_SM_ON) + { + /* Try to use an unshifted RDSVL, otherwise fall back on + a shifted RDSVL #1. */ + if (aarch64_sve_rdvl_addvl_factor_p (factor)) + shift = 0; + else + factor = rel_factor * 16; + val = aarch64_sme_vq_immediate (mode, factor, 0); + } /* Try to use an unshifted CNT[BHWD] or RDVL. */ - if (aarch64_sve_cnt_factor_p (factor) - || aarch64_sve_rdvl_addvl_factor_p (factor)) + else if (aarch64_sve_cnt_factor_p (factor) + || aarch64_sve_rdvl_addvl_factor_p (factor)) { val = gen_int_mode (poly_int64 (factor, factor), mode); shift = 0; @@ -6873,11 +6929,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, a shift and add sequence for the multiplication. If CNTB << SHIFT is out of range, stick with the current shift factor. */ - if (IN_RANGE (low_bit, 2, 16 * 16)) + if (force_isa_mode == 0 + && IN_RANGE (low_bit, 2, 16 * 16)) { val = gen_int_mode (poly_int64 (low_bit, low_bit), mode); shift = 0; } + else if ((force_isa_mode & AARCH64_FL_SM_ON) + && aarch64_sve_rdvl_addvl_factor_p (low_bit)) + { + val = aarch64_sme_vq_immediate (mode, low_bit, 0); + shift = 0; + } else val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode); @@ -6965,30 +7028,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src, rtx offset_rtx, rtx temp1, rtx temp2) { aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx), - temp1, temp2, false); + temp1, temp2, 0, false); } /* Add DELTA to the stack pointer, marking the instructions frame-related. - TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false - if TEMP1 already contains abs (DELTA). */ + TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as + for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already + contains abs (DELTA). 
*/ static inline void -aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm) +aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, + aarch64_feature_flags force_isa_mode, bool emit_move_imm) { aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta, - temp1, temp2, true, emit_move_imm); + temp1, temp2, force_isa_mode, true, emit_move_imm); } /* Subtract DELTA from the stack pointer, marking the instructions - frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary - if nonnull. */ + frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for + aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */ static inline void -aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p, - bool emit_move_imm = true) +aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, + aarch64_feature_flags force_isa_mode, + bool frame_related_p, bool emit_move_imm = true) { aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta, - temp1, temp2, frame_related_p, emit_move_imm); + temp1, temp2, force_isa_mode, frame_related_p, + emit_move_imm); } /* A streaming-compatible function needs to switch temporarily to the known @@ -8014,11 +8081,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) { base = aarch64_force_temporary (int_mode, dest, base); aarch64_add_offset (int_mode, dest, base, offset, - NULL_RTX, NULL_RTX, false); + NULL_RTX, NULL_RTX, 0, false); } else aarch64_add_offset (int_mode, dest, base, offset, - dest, NULL_RTX, false); + dest, NULL_RTX, 0, false); } return; } @@ -8045,7 +8112,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) gcc_assert (can_create_pseudo_p ()); base = aarch64_force_temporary (int_mode, dest, base); aarch64_add_offset (int_mode, dest, base, const_offset, - NULL_RTX, NULL_RTX, false); + NULL_RTX, NULL_RTX, 0, false); return; } @@ -8085,7 +8152,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) gcc_assert(can_create_pseudo_p ()); base = aarch64_force_temporary (int_mode, dest, base); aarch64_add_offset (int_mode, dest, base, const_offset, - NULL_RTX, NULL_RTX, false); + NULL_RTX, NULL_RTX, 0, false); return; } /* FALLTHRU */ @@ -9728,6 +9795,9 @@ aarch64_need_old_pstate_sm () if (aarch64_cfun_incoming_pstate_sm () != 0) return false; + if (aarch64_cfun_enables_pstate_sm ()) + return true; + if (cfun->machine->call_switches_pstate_sm) for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn)) if (auto *call = dyn_cast<rtx_call_insn *> (insn)) @@ -9754,6 +9824,7 @@ aarch64_layout_frame (void) bool frame_related_fp_reg_p = false; aarch64_frame &frame = cfun->machine->frame; poly_int64 top_of_locals = -1; + bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm (); vec_safe_truncate (frame.saved_gprs, 0); vec_safe_truncate (frame.saved_fprs, 0); @@ -9791,7 +9862,7 @@ aarch64_layout_frame (void) frame.reg_offset[regno] = SLOT_REQUIRED; for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) - if (df_regs_ever_live_p (regno) + if ((enables_pstate_sm || df_regs_ever_live_p (regno)) && !fixed_regs[regno] && !crtl->abi->clobbers_full_reg_p (regno)) { @@ -9820,7 +9891,7 @@ aarch64_layout_frame (void) } for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) - if (df_regs_ever_live_p (regno) + if ((enables_pstate_sm || df_regs_ever_live_p (regno)) && !fixed_regs[regno] && !crtl->abi->clobbers_full_reg_p (regno)) frame.reg_offset[regno] = SLOT_REQUIRED; @@ -9937,7 +10008,8 @@ aarch64_layout_frame (void) /* If the current function changes the SVE vector length, ensure that the old value 
of the DWARF VG register is saved and available in the CFI, so that outer frames with VL-sized offsets can be processed correctly. */ - if (cfun->machine->call_switches_pstate_sm) + if (cfun->machine->call_switches_pstate_sm + || aarch64_cfun_enables_pstate_sm ()) { frame.reg_offset[VG_REGNUM] = offset; offset += UNITS_PER_WORD; @@ -10776,9 +10848,16 @@ aarch64_get_separate_components (void) bitmap_clear (components); /* The registers we need saved to the frame. */ + bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm (); for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) if (aarch64_register_saved_on_entry (regno)) { + /* Disallow shrink wrapping for registers that will be clobbered + by an SMSTART SM in the prologue. */ + if (enables_pstate_sm + && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno))) + continue; + /* Punt on saves and restores that use ST1D and LD1D. We could try to be smarter, but it would involve making sure that the spare predicate register itself is safe to use at the save @@ -11097,11 +11176,16 @@ aarch64_emit_stack_tie (rtx reg) events, e.g. if we were to allow the stack to be dropped by more than a page and then have multiple probes up and we take a signal somewhere in between then the signal handler doesn't know the state of the stack and can make no - assumptions about which pages have been probed. */ + assumptions about which pages have been probed. + + FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE + is measured relative to the SME vector length instead of the current + prevailing vector length. It is 0 otherwise. */ static void aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, poly_int64 poly_size, + aarch64_feature_flags force_isa_mode, bool frame_related_p, bool final_adjustment_p) { @@ -11143,7 +11227,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, if (known_lt (poly_size, min_probe_threshold) || !flag_stack_clash_protection) { - aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); + aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode, + frame_related_p); return; } @@ -11160,7 +11245,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, /* First calculate the amount of bytes we're actually spilling. */ aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode), - poly_size, temp1, temp2, false, true); + poly_size, temp1, temp2, force_isa_mode, + false, true); rtx_insn *insn = get_last_insn (); @@ -11218,7 +11304,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, { for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size) { - aarch64_sub_sp (NULL, temp2, guard_size, true); + aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true); emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, guard_used_by_caller)); emit_insn (gen_blockage ()); @@ -11229,7 +11315,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, { /* Compute the ending address. 
*/ aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size, - temp1, NULL, false, true); + temp1, NULL, force_isa_mode, false, true); rtx_insn *insn = get_last_insn (); /* For the initial allocation, we don't have a frame pointer @@ -11295,7 +11381,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, if (final_adjustment_p && rounded_size != 0) min_probe_threshold = 0; - aarch64_sub_sp (temp1, temp2, residual, frame_related_p); + aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p); if (residual >= min_probe_threshold) { if (dump_file) @@ -11360,6 +11446,14 @@ aarch64_epilogue_uses (int regno) return 0; } +/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */ + +static bool +aarch64_use_late_prologue_epilogue () +{ + return aarch64_cfun_enables_pstate_sm (); +} + /* The current function's frame has a save slot for the incoming state of SVCR. Return a legitimate memory for the slot, based on the hard frame pointer. */ @@ -11496,6 +11590,9 @@ aarch64_expand_prologue (void) unsigned reg2 = frame.wb_push_candidate2; bool emit_frame_chain = frame.emit_frame_chain; rtx_insn *insn; + aarch64_feature_flags force_isa_mode = 0; + if (aarch64_cfun_enables_pstate_sm ()) + force_isa_mode = AARCH64_FL_SM_ON; if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) { @@ -11557,7 +11654,7 @@ aarch64_expand_prologue (void) less the amount of the guard reserved for use by the caller's outgoing args. */ aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust, - true, false); + force_isa_mode, true, false); if (callee_adjust != 0) aarch64_push_regs (reg1, reg2, callee_adjust); @@ -11580,7 +11677,8 @@ aarch64_expand_prologue (void) gcc_assert (known_eq (chain_offset, 0)); aarch64_add_offset (Pmode, hard_frame_pointer_rtx, stack_pointer_rtx, chain_offset, - tmp1_rtx, tmp0_rtx, frame_pointer_needed); + tmp1_rtx, tmp0_rtx, force_isa_mode, + frame_pointer_needed); if (frame_pointer_needed && !frame_size.is_constant ()) { /* Variable-sized frames need to describe the save slot @@ -11627,6 +11725,7 @@ aarch64_expand_prologue (void) || known_eq (initial_adjust, 0)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, sve_callee_adjust, + force_isa_mode, !frame_pointer_needed, false); bytes_below_sp -= sve_callee_adjust; } @@ -11639,12 +11738,15 @@ aarch64_expand_prologue (void) that is assumed by the called. */ gcc_assert (known_eq (bytes_below_sp, final_adjust)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, + force_isa_mode, !frame_pointer_needed, true); if (emit_frame_chain && maybe_ne (final_adjust, 0)) aarch64_emit_stack_tie (hard_frame_pointer_rtx); - /* Save the incoming value of PSTATE.SM, if required. */ - if (known_ge (frame.old_svcr_offset, 0)) + /* Save the incoming value of PSTATE.SM, if required. Code further + down does this for locally-streaming functions. */ + if (known_ge (frame.old_svcr_offset, 0) + && !aarch64_cfun_enables_pstate_sm ()) { rtx mem = aarch64_old_svcr_mem (); MEM_VOLATILE_P (mem) = 1; @@ -11676,6 +11778,34 @@ aarch64_expand_prologue (void) emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1); } } + + /* Enable PSTATE.SM, if required. */ + if (aarch64_cfun_enables_pstate_sm ()) + { + rtx_insn *guard_label = nullptr; + if (known_ge (cfun->machine->frame.old_svcr_offset, 0)) + { + /* The current function is streaming-compatible. Save the + original state of PSTATE.SM. 
*/ + rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM); + emit_insn (gen_aarch64_read_svcr (svcr)); + emit_move_insn (aarch64_old_svcr_mem (), svcr); + guard_label = aarch64_guard_switch_pstate_sm (svcr, + aarch64_isa_flags); + } + aarch64_sme_mode_switch_regs args_switch; + auto &args = crtl->args.info; + for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i) + { + rtx x = args.sme_mode_switch_args[i]; + args_switch.add_reg (GET_MODE (x), REGNO (x)); + } + args_switch.emit_prologue (); + emit_insn (gen_aarch64_smstart_sm ()); + args_switch.emit_epilogue (); + if (guard_label) + emit_label (guard_label); + } } /* Return TRUE if we can use a simple_return insn. @@ -11722,6 +11852,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall) HOST_WIDE_INT guard_size = 1 << param_stack_clash_protection_guard_size; HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; + aarch64_feature_flags force_isa_mode = 0; + if (aarch64_cfun_enables_pstate_sm ()) + force_isa_mode = AARCH64_FL_SM_ON; /* We can re-use the registers when: @@ -11746,6 +11879,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall) = maybe_ne (get_frame_size () + frame.saved_varargs_size, 0); + /* Reset PSTATE.SM, if required. */ + if (aarch64_cfun_enables_pstate_sm ()) + { + rtx_insn *guard_label = nullptr; + if (known_ge (cfun->machine->frame.old_svcr_offset, 0)) + guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM, + aarch64_isa_flags); + aarch64_sme_mode_switch_regs return_switch; + if (crtl->return_rtx && REG_P (crtl->return_rtx)) + return_switch.add_reg (GET_MODE (crtl->return_rtx), + REGNO (crtl->return_rtx)); + return_switch.emit_prologue (); + emit_insn (gen_aarch64_smstop_sm ()); + return_switch.emit_epilogue (); + if (guard_label) + emit_label (guard_label); + } + /* Emit a barrier to prevent loads from a deallocated stack. */ if (maybe_gt (final_adjust, crtl->outgoing_args_size) || cfun->calls_alloca @@ -11766,19 +11917,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall) aarch64_add_offset (Pmode, stack_pointer_rtx, hard_frame_pointer_rtx, -bytes_below_hard_fp + final_adjust, - tmp1_rtx, tmp0_rtx, callee_adjust == 0); + tmp1_rtx, tmp0_rtx, force_isa_mode, + callee_adjust == 0); else /* The case where we need to re-use the register here is very rare, so avoid the complicated condition and just always emit a move if the immediate doesn't fit. */ - aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true); /* Restore the vector registers before the predicate registers, so that we can use P4 as a temporary for big-endian SVE frames. */ aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops); aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops); if (maybe_ne (sve_callee_adjust, 0)) - aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, + force_isa_mode, true); /* When shadow call stack is enabled, the scs_pop in the epilogue will restore x30, we don't need to restore x30 again in the traditional @@ -11808,7 +11961,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall) /* Liveness of EP0_REGNUM can not be trusted across function calls either, so add restriction on emit_move optimization to leaf functions. 
*/ - aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, + aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode, (!can_inherit_p || !crtl->is_leaf || df_regs_ever_live_p (EP0_REGNUM))); @@ -11941,7 +12094,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, temp1 = gen_rtx_REG (Pmode, EP1_REGNUM); if (vcall_offset == 0) - aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false); + aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, + 0, false); else { gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0); @@ -11954,7 +12108,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, plus_constant (Pmode, this_rtx, delta)); else aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, - temp1, temp0, false); + temp1, temp0, 0, false); } if (Pmode == ptr_mode) @@ -31355,6 +31509,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_EXTRA_LIVE_ON_ENTRY #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry +#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE +#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue + #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C index 8b0755014cc..dc5c097bd52 100644 --- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C +++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C @@ -7,3 +7,4 @@ void f4 () __arm_out("za"); void f5 () __arm_inout("za"); void f6 () __arm_preserves("za"); __arm_new("za") void f7 () {} +__arm_locally_streaming void f8 () {} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c index fcabe3edc55..22f5facfdf9 100644 --- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c @@ -7,3 +7,4 @@ void f4 () __arm_out("za"); void f5 () __arm_inout("za"); void f6 () __arm_preserves("za"); __arm_new("za") void f7 () {} +__arm_locally_streaming void f8 () {} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c new file mode 100644 index 00000000000..20ff4b87d94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c @@ -0,0 +1,466 @@ +// { dg-options "-O -fomit-frame-pointer" } +// { dg-final { check-function-bodies "**" "" } } + +void consume_za () [[arm::streaming, arm::inout("za")]]; + +/* +** n_ls: +** sub sp, sp, #?80 +** cntd x16 +** str x16, \[sp\] +** stp d8, d9, \[sp, #?16\] +** stp d10, d11, \[sp, #?32\] +** stp d12, d13, \[sp, #?48\] +** stp d14, d15, \[sp, #?64\] +** smstart sm +** smstop sm +** ldp d8, d9, \[sp, #?16\] +** ldp d10, d11, \[sp, #?32\] +** ldp d12, d13, \[sp, #?48\] +** ldp d14, d15, \[sp, #?64\] +** add sp, sp, #?80 +** ret +*/ +[[arm::locally_streaming]] void +n_ls () +{ + asm (""); +} + +/* +** s_ls: +** ret +*/ +[[arm::locally_streaming]] void +s_ls () [[arm::streaming]] +{ + asm (""); +} + +/* +** sc_ls: +** stp x29, x30, \[sp, #?-96\]! 
+** mov x29, sp +** cntd x16 +** str x16, \[sp, #?24\] +** stp d8, d9, \[sp, #?32\] +** stp d10, d11, \[sp, #?48\] +** stp d12, d13, \[sp, #?64\] +** stp d14, d15, \[sp, #?80\] +** mrs x16, svcr +** str x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstart sm +** ldr x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstop sm +** ldp d8, d9, \[sp, #?32\] +** ldp d10, d11, \[sp, #?48\] +** ldp d12, d13, \[sp, #?64\] +** ldp d14, d15, \[sp, #?80\] +** ldp x29, x30, \[sp\], #?96 +** ret +*/ +[[arm::locally_streaming]] void +sc_ls () [[arm::streaming_compatible]] +{ + asm (""); +} + +/* +** n_ls_new_za: +** str x30, \[sp, #?-80\]! +** cntd x16 +** str x16, \[sp, #?8\] +** stp d8, d9, \[sp, #?16\] +** stp d10, d11, \[sp, #?32\] +** stp d12, d13, \[sp, #?48\] +** stp d14, d15, \[sp, #?64\] +** smstart sm +** mrs (x[0-9]+), tpidr2_el0 +** cbz \1, [^\n]+ +** bl __arm_tpidr2_save +** msr tpidr2_el0, xzr +** zero { za } +** smstart za +** bl consume_za +** smstop za +** smstop sm +** ldp d8, d9, \[sp, #?16\] +** ldp d10, d11, \[sp, #?32\] +** ldp d12, d13, \[sp, #?48\] +** ldp d14, d15, \[sp, #?64\] +** ldr x30, \[sp\], #?80 +** ret +*/ +[[arm::locally_streaming, arm::new("za")]] void +n_ls_new_za () +{ + consume_za (); + asm (""); +} + +/* +** s_ls_new_za: +** str x30, \[sp, #?-16\]! +** mrs (x[0-9]+), tpidr2_el0 +** cbz \1, [^\n]+ +** bl __arm_tpidr2_save +** msr tpidr2_el0, xzr +** zero { za } +** smstart za +** bl consume_za +** smstop za +** ldr x30, \[sp\], #?16 +** ret +*/ +[[arm::locally_streaming, arm::new("za")]] void +s_ls_new_za () [[arm::streaming]] +{ + consume_za (); + asm (""); +} + +/* +** sc_ls_new_za: +** stp x29, x30, \[sp, #?-96\]! +** mov x29, sp +** cntd x16 +** str x16, \[sp, #?24\] +** stp d8, d9, \[sp, #?32\] +** stp d10, d11, \[sp, #?48\] +** stp d12, d13, \[sp, #?64\] +** stp d14, d15, \[sp, #?80\] +** mrs x16, svcr +** str x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstart sm +** mrs (x[0-9]+), tpidr2_el0 +** cbz \1, [^\n]+ +** bl __arm_tpidr2_save +** msr tpidr2_el0, xzr +** zero { za } +** smstart za +** bl consume_za +** smstop za +** ldr x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstop sm +** ldp d8, d9, \[sp, #?32\] +** ldp d10, d11, \[sp, #?48\] +** ldp d12, d13, \[sp, #?64\] +** ldp d14, d15, \[sp, #?80\] +** ldp x29, x30, \[sp\], #?96 +** ret +*/ +[[arm::locally_streaming, arm::new("za")]] void +sc_ls_new_za () [[arm::streaming_compatible]] +{ + consume_za (); + asm (""); +} + +/* +** n_ls_shared_za: +** str x30, \[sp, #?-80\]! +** cntd x16 +** str x16, \[sp, #?8\] +** stp d8, d9, \[sp, #?16\] +** stp d10, d11, \[sp, #?32\] +** stp d12, d13, \[sp, #?48\] +** stp d14, d15, \[sp, #?64\] +** smstart sm +** bl consume_za +** smstop sm +** ldp d8, d9, \[sp, #?16\] +** ldp d10, d11, \[sp, #?32\] +** ldp d12, d13, \[sp, #?48\] +** ldp d14, d15, \[sp, #?64\] +** ldr x30, \[sp\], #?80 +** ret +*/ +[[arm::locally_streaming]] void +n_ls_shared_za () [[arm::inout("za")]] +{ + consume_za (); + asm (""); +} + +/* +** s_ls_shared_za: +** str x30, \[sp, #?-16\]! +** bl consume_za +** ldr x30, \[sp\], #?16 +** ret +*/ +[[arm::locally_streaming]] void +s_ls_shared_za () [[arm::streaming, arm::inout("za")]] +{ + consume_za (); + asm (""); +} + +/* +** sc_ls_shared_za: +** stp x29, x30, \[sp, #?-96\]! 
+** mov x29, sp +** cntd x16 +** str x16, \[sp, #?24\] +** stp d8, d9, \[sp, #?32\] +** stp d10, d11, \[sp, #?48\] +** stp d12, d13, \[sp, #?64\] +** stp d14, d15, \[sp, #?80\] +** mrs x16, svcr +** str x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstart sm +** bl consume_za +** ldr x16, \[x29, #?16\] +** tbnz x16, 0, [^\n]+ +** smstop sm +** ldp d8, d9, \[sp, #?32\] +** ldp d10, d11, \[sp, #?48\] +** ldp d12, d13, \[sp, #?64\] +** ldp d14, d15, \[sp, #?80\] +** ldp x29, x30, \[sp\], #?96 +** ret +*/ +[[arm::locally_streaming]] void +sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]] +{ + consume_za (); + asm (""); +} + +/* +** n_ls_vector_pcs: +** sub sp, sp, #?272 +** cntd x16 +** str x16, \[sp\] +** stp q8, q9, \[sp, #?16\] +** stp q10, q11, \[sp, #?48\] +** stp q12, q13, \[sp, #?80\] +** stp q14, q15, \[sp, #?112\] +** stp q16, q17, \[sp, #?144\] +** stp q18, q19, \[sp, #?176\] +** stp q20, q21, \[sp, #?208\] +** stp q22, q23, \[sp, #?240\] +** smstart sm +** smstop sm +** ldp q8, q9, \[sp, #?16\] +** ldp q10, q11, \[sp, #?48\] +** ldp q12, q13, \[sp, #?80\] +** ldp q14, q15, \[sp, #?112\] +** ldp q16, q17, \[sp, #?144\] +** ldp q18, q19, \[sp, #?176\] +** ldp q20, q21, \[sp, #?208\] +** ldp q22, q23, \[sp, #?240\] +** add sp, sp, #?272 +** ret +*/ +[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs)) +n_ls_vector_pcs () +{ + asm (""); +} + +/* +** n_ls_sve_pcs: +** sub sp, sp, #?16 +** cntd x16 +** str x16, \[sp\] +** addsvl sp, sp, #-18 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str p12, \[sp, #8, mul vl\] +** str p13, \[sp, #9, mul vl\] +** str p14, \[sp, #10, mul vl\] +** str p15, \[sp, #11, mul vl\] +** str z8, \[sp, #2, mul vl\] +** str z9, \[sp, #3, mul vl\] +** str z10, \[sp, #4, mul vl\] +** str z11, \[sp, #5, mul vl\] +** str z12, \[sp, #6, mul vl\] +** str z13, \[sp, #7, mul vl\] +** str z14, \[sp, #8, mul vl\] +** str z15, \[sp, #9, mul vl\] +** str z16, \[sp, #10, mul vl\] +** str z17, \[sp, #11, mul vl\] +** str z18, \[sp, #12, mul vl\] +** str z19, \[sp, #13, mul vl\] +** str z20, \[sp, #14, mul vl\] +** str z21, \[sp, #15, mul vl\] +** str z22, \[sp, #16, mul vl\] +** str z23, \[sp, #17, mul vl\] +** addvl sp, sp, #-1 +** str p0, \[sp\] +** smstart sm +** ldr p0, \[sp\] +** addvl sp, sp, #1 +** smstop sm +** ldr z8, \[sp, #2, mul vl\] +** ldr z9, \[sp, #3, mul vl\] +** ldr z10, \[sp, #4, mul vl\] +** ldr z11, \[sp, #5, mul vl\] +** ldr z12, \[sp, #6, mul vl\] +** ldr z13, \[sp, #7, mul vl\] +** ldr z14, \[sp, #8, mul vl\] +** ldr z15, \[sp, #9, mul vl\] +** ldr z16, \[sp, #10, mul vl\] +** ldr z17, \[sp, #11, mul vl\] +** ldr z18, \[sp, #12, mul vl\] +** ldr z19, \[sp, #13, mul vl\] +** ldr z20, \[sp, #14, mul vl\] +** ldr z21, \[sp, #15, mul vl\] +** ldr z22, \[sp, #16, mul vl\] +** ldr z23, \[sp, #17, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** ldr p12, \[sp, #8, mul vl\] +** ldr p13, \[sp, #9, mul vl\] +** ldr p14, \[sp, #10, mul vl\] +** ldr p15, \[sp, #11, mul vl\] +** addsvl sp, sp, #18 +** add sp, sp, #?16 +** ret +*/ +[[arm::locally_streaming]] void +n_ls_sve_pcs (__SVBool_t x) +{ + asm (""); +} + +/* +** n_ls_v0: +** addsvl sp, 
sp, #-1 +** ... +** smstart sm +** add x[0-9]+, [^\n]+ +** smstop sm +** ... +** addsvl sp, sp, #1 +** ... +*/ +#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN)); +[[arm::locally_streaming]] void +n_ls_v0 () +{ + TEST (v0); +} + +/* +** n_ls_v32: +** addsvl sp, sp, #-32 +** ... +** smstart sm +** ... +** smstop sm +** ... +** rdsvl (x[0-9]+), #1 +** lsl (x[0-9]+), \1, #?5 +** add sp, sp, \2 +** ... +*/ +[[arm::locally_streaming]] void +n_ls_v32 () +{ + TEST (v0); + TEST (v1); + TEST (v2); + TEST (v3); + TEST (v4); + TEST (v5); + TEST (v6); + TEST (v7); + TEST (v8); + TEST (v9); + TEST (v10); + TEST (v11); + TEST (v12); + TEST (v13); + TEST (v14); + TEST (v15); + TEST (v16); + TEST (v17); + TEST (v18); + TEST (v19); + TEST (v20); + TEST (v21); + TEST (v22); + TEST (v23); + TEST (v24); + TEST (v25); + TEST (v26); + TEST (v27); + TEST (v28); + TEST (v29); + TEST (v30); + TEST (v31); +} + +/* +** n_ls_v33: +** rdsvl (x[0-9]+), #1 +** mov (x[0-9]+), #?33 +** mul (x[0-9]+), (?:\1, \2|\2, \1) +** sub sp, sp, \3 +** ... +** smstart sm +** ... +** smstop sm +** ... +** rdsvl (x[0-9]+), #1 +** mov (x[0-9]+), #?33 +** mul (x[0-9]+), (?:\4, \5|\5, \4) +** add sp, sp, \6 +** ... +*/ +[[arm::locally_streaming]] void +n_ls_v33 () +{ + TEST (v0); + TEST (v1); + TEST (v2); + TEST (v3); + TEST (v4); + TEST (v5); + TEST (v6); + TEST (v7); + TEST (v8); + TEST (v9); + TEST (v10); + TEST (v11); + TEST (v12); + TEST (v13); + TEST (v14); + TEST (v15); + TEST (v16); + TEST (v17); + TEST (v18); + TEST (v19); + TEST (v20); + TEST (v21); + TEST (v22); + TEST (v23); + TEST (v24); + TEST (v25); + TEST (v26); + TEST (v27); + TEST (v28); + TEST (v29); + TEST (v30); + TEST (v31); + TEST (v32); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c new file mode 100644 index 00000000000..0eba993855f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c @@ -0,0 +1,177 @@ +// { dg-options "-O -fomit-frame-pointer" } +// { dg-final { check-function-bodies "**" "" } } + +#include <arm_neon.h> +#include <arm_sve.h> + +/* +** test_d0: +** ... +** smstart sm +** ... +** fmov x10, d0 +** smstop sm +** fmov d0, x10 +** ... +*/ +[[arm::locally_streaming]] double +test_d0 () +{ + asm (""); + return 1.0f; +} + +/* +** test_d0_vec: +** ... +** smstart sm +** ... +** ( +** fmov x10, d0 +** | +** umov x10, v0.d\[0\] +** ) +** smstop sm +** fmov d0, x10 +** ... +*/ +[[arm::locally_streaming]] int8x8_t +test_d0_vec () +{ + asm (""); + return (int8x8_t) {}; +} + +/* +** test_q0: +** ... +** smstart sm +** ... +** str q0, \[sp, #?-16\]! +** smstop sm +** ldr q0, \[sp\], #?16 +** ... +*/ +[[arm::locally_streaming]] int8x16_t +test_q0 () +{ + asm (""); + return (int8x16_t) {}; +} + +/* +** test_q1: +** ... +** smstart sm +** ... +** stp q0, q1, \[sp, #?-32\]! +** smstop sm +** ldp q0, q1, \[sp\], #?32 +** ... +*/ +[[arm::locally_streaming]] int8x16x2_t +test_q1 () +{ + asm (""); + return (int8x16x2_t) {}; +} + +/* +** test_q2: +** ... +** smstart sm +** ... +** stp q0, q1, \[sp, #?-48\]! +** str q2, \[sp, #?32\] +** smstop sm +** ldr q2, \[sp, #?32\] +** ldp q0, q1, \[sp\], #?48 +** ... +*/ +[[arm::locally_streaming]] int8x16x3_t +test_q2 () +{ + asm (""); + return (int8x16x3_t) {}; +} + +/* +** test_q3: +** ... +** smstart sm +** ... +** stp q0, q1, \[sp, #?-64\]! +** stp q2, q3, \[sp, #?32\] +** smstop sm +** ldp q2, q3, \[sp, #?32\] +** ldp q0, q1, \[sp\], #?64 +** ... 
+*/ +[[arm::locally_streaming]] int8x16x4_t +test_q3 () +{ + asm (""); + return (int8x16x4_t) {}; +} + +/* +** test_z0: +** ... +** smstart sm +** mov z0\.b, #0 +** addvl sp, sp, #-1 +** str z0, \[sp\] +** smstop sm +** ldr z0, \[sp\] +** addvl sp, sp, #1 +** ... +*/ +[[arm::locally_streaming]] svint8_t +test_z0 () +{ + asm (""); + return (svint8_t) {}; +} + +/* +** test_z3: +** ... +** smstart sm +** ... +** addvl sp, sp, #-4 +** str z0, \[sp\] +** str z1, \[sp, #1, mul vl\] +** str z2, \[sp, #2, mul vl\] +** str z3, \[sp, #3, mul vl\] +** smstop sm +** ldr z0, \[sp\] +** ldr z1, \[sp, #1, mul vl\] +** ldr z2, \[sp, #2, mul vl\] +** ldr z3, \[sp, #3, mul vl\] +** ... +*/ +[[arm::locally_streaming]] svint8x4_t +test_z3 () +{ + asm (""); + return (svint8x4_t) {}; +} + +/* +** test_p0: +** ... +** smstart sm +** pfalse p0\.b +** addvl sp, sp, #-1 +** str p0, \[sp\] +** smstop sm +** ldr p0, \[sp\] +** addvl sp, sp, #1 +** ... +*/ +[[arm::locally_streaming]] svbool_t +test_p0 () +{ + asm (""); + return (svbool_t) {}; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c new file mode 100644 index 00000000000..2bdea6ac631 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c @@ -0,0 +1,273 @@ +// { dg-options "-O -fomit-frame-pointer" } +// { dg-final { check-function-bodies "**" "" } } + +#include <arm_neon.h> +#include <arm_sve.h> + +/* +** test_d0: +** ... +** fmov x10, d0 +** smstart sm +** fmov d0, x10 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_d0 (double d0) +{ + asm (""); +} + +/* +** test_d7: +** ... +** fmov x10, d0 +** fmov x11, d1 +** fmov x12, d2 +** fmov x13, d3 +** fmov x14, d4 +** fmov x15, d5 +** fmov x16, d6 +** fmov x17, d7 +** smstart sm +** fmov d0, x10 +** fmov d1, x11 +** fmov d2, x12 +** fmov d3, x13 +** fmov d4, x14 +** fmov d5, x15 +** fmov d6, x16 +** fmov d7, x17 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_d7 (double d0, double d1, double d2, double d3, + double d4, double d5, double d6, double d7) +{ + asm (""); +} + +/* +** test_d0_vec: +** ... +** ( +** fmov x10, d0 +** | +** umov x10, v0.d\[0\] +** ) +** smstart sm +** fmov d0, x10 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_d0_vec (int8x8_t d0) +{ + asm (""); +} + +/* +** test_d7_vec: +** ... +** ( +** fmov x10, d0 +** fmov x11, d1 +** fmov x12, d2 +** fmov x13, d3 +** fmov x14, d4 +** fmov x15, d5 +** fmov x16, d6 +** fmov x17, d7 +** | +** umov x10, v0.d\[0\] +** umov x11, v1.d\[0\] +** umov x12, v2.d\[0\] +** umov x13, v3.d\[0\] +** umov x14, v4.d\[0\] +** umov x15, v5.d\[0\] +** umov x16, v6.d\[0\] +** umov x17, v7.d\[0\] +** ) +** smstart sm +** fmov d0, x10 +** fmov d1, x11 +** fmov d2, x12 +** fmov d3, x13 +** fmov d4, x14 +** fmov d5, x15 +** fmov d6, x16 +** fmov d7, x17 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3, + int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7) +{ + asm (""); +} + +/* +** test_q0: +** ... +** str q0, \[sp, #?-16\]! +** smstart sm +** ldr q0, \[sp\], #?16 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_q0 (int8x16_t q0) +{ + asm (""); +} + +/* +** test_q7: +** ... +** stp q0, q1, \[sp, #?-128\]! 
+** stp q2, q3, \[sp, #?32\] +** stp q4, q5, \[sp, #?64\] +** stp q6, q7, \[sp, #?96\] +** smstart sm +** ldp q2, q3, \[sp, #?32\] +** ldp q4, q5, \[sp, #?64\] +** ldp q6, q7, \[sp, #?96\] +** ldp q0, q1, \[sp\], #?128 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_q7 (int8x16x4_t q0, int8x16x4_t q4) +{ + asm (""); +} + +/* +** test_z0: +** ... +** addvl sp, sp, #-1 +** str z0, \[sp\] +** smstart sm +** ldr z0, \[sp\] +** addvl sp, sp, #1 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_z0 (svint8_t z0) +{ + asm (""); +} + +/* +** test_z7: +** ... +** addvl sp, sp, #-8 +** str z0, \[sp\] +** str z1, \[sp, #1, mul vl\] +** str z2, \[sp, #2, mul vl\] +** str z3, \[sp, #3, mul vl\] +** str z4, \[sp, #4, mul vl\] +** str z5, \[sp, #5, mul vl\] +** str z6, \[sp, #6, mul vl\] +** str z7, \[sp, #7, mul vl\] +** smstart sm +** ldr z0, \[sp\] +** ldr z1, \[sp, #1, mul vl\] +** ldr z2, \[sp, #2, mul vl\] +** ldr z3, \[sp, #3, mul vl\] +** ldr z4, \[sp, #4, mul vl\] +** ldr z5, \[sp, #5, mul vl\] +** ldr z6, \[sp, #6, mul vl\] +** ldr z7, \[sp, #7, mul vl\] +** addvl sp, sp, #8 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_z7 (svint8x4_t z0, svint8x4_t z4) +{ + asm (""); +} + +/* +** test_p0: +** ... +** addvl sp, sp, #-1 +** str p0, \[sp\] +** smstart sm +** ldr p0, \[sp\] +** addvl sp, sp, #1 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_p0 (svbool_t p0) +{ + asm (""); +} + +/* +** test_p3: +** ... +** addvl sp, sp, #-1 +** str p0, \[sp\] +** str p1, \[sp, #1, mul vl\] +** str p2, \[sp, #2, mul vl\] +** str p3, \[sp, #3, mul vl\] +** smstart sm +** ldr p0, \[sp\] +** ldr p1, \[sp, #1, mul vl\] +** ldr p2, \[sp, #2, mul vl\] +** ldr p3, \[sp, #3, mul vl\] +** addvl sp, sp, #1 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm (""); +} + +/* +** test_mixed: +** ... +** addvl sp, sp, #-3 +** str p0, \[sp\] +** str p1, \[sp, #1, mul vl\] +** str p2, \[sp, #2, mul vl\] +** str p3, \[sp, #3, mul vl\] +** str z3, \[sp, #1, mul vl\] +** str z7, \[sp, #2, mul vl\] +** stp q2, q6, \[sp, #?-32\]! +** fmov w10, s0 +** fmov x11, d1 +** fmov w12, s4 +** fmov x13, d5 +** smstart sm +** fmov s0, w10 +** fmov d1, x11 +** fmov s4, w12 +** fmov d5, x13 +** ldp q2, q6, \[sp\], #?32 +** ldr p0, \[sp\] +** ldr p1, \[sp, #1, mul vl\] +** ldr p2, \[sp, #2, mul vl\] +** ldr p3, \[sp, #3, mul vl\] +** ldr z3, \[sp, #1, mul vl\] +** ldr z7, \[sp, #2, mul vl\] +** addvl sp, sp, #3 +** smstop sm +** ... +*/ +[[arm::locally_streaming]] void +test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3, + float s4, double d5, float64x2_t q6, svfloat64_t z7, + svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm (""); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c new file mode 100644 index 00000000000..42adeb152e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c @@ -0,0 +1,145 @@ +// { dg-options "-O -fomit-frame-pointer" } +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <arm_neon.h> +#include <arm_sve.h> + +/* +** test_d0: +** ... +** smstart sm +** ... +** fmov x10, d0 +** smstop sm +** fmov d0, x10 +** ... +** smstart sm +** ... +** smstop sm +** ... +*/ +void consume_d0 (double d0); + +__arm_locally_streaming void +test_d0 () +{ + asm (""); + consume_d0 (1.0); + asm (""); +} + +/* +** test_d7: +** ... 
+** fmov x10, d0 +** fmov x11, d1 +** fmov x12, d2 +** fmov x13, d3 +** fmov x14, d4 +** fmov x15, d5 +** fmov x16, d6 +** fmov x17, d7 +** smstop sm +** fmov d0, x10 +** fmov d1, x11 +** fmov d2, x12 +** fmov d3, x13 +** fmov d4, x14 +** fmov d5, x15 +** fmov d6, x16 +** fmov d7, x17 +** ... +*/ +void consume_d7 (double d0, double d1, double d2, double d3, + double d4, double d5, double d6, double d7); +__arm_locally_streaming void +test_d7 () +{ + asm (""); + consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + asm (""); +} + +/* +** test_q7: +** ... +** stp q0, q1, \[sp, #?-128\]! +** stp q2, q3, \[sp, #?32\] +** stp q4, q5, \[sp, #?64\] +** stp q6, q7, \[sp, #?96\] +** smstop sm +** ldp q2, q3, \[sp, #?32\] +** ldp q4, q5, \[sp, #?64\] +** ldp q6, q7, \[sp, #?96\] +** ldp q0, q1, \[sp\], #?128 +** ... +*/ +void consume_q7 (int8x16x4_t q0, int8x16x4_t q4); + +__arm_locally_streaming void +test_q7 (int8x16x4_t *ptr) +{ + asm (""); + consume_q7 (ptr[0], ptr[1]); + asm (""); +} + +/* +** test_z7: +** ... +** addvl sp, sp, #-8 +** str z0, \[sp\] +** str z1, \[sp, #1, mul vl\] +** str z2, \[sp, #2, mul vl\] +** str z3, \[sp, #3, mul vl\] +** str z4, \[sp, #4, mul vl\] +** str z5, \[sp, #5, mul vl\] +** str z6, \[sp, #6, mul vl\] +** str z7, \[sp, #7, mul vl\] +** smstop sm +** ldr z0, \[sp\] +** ldr z1, \[sp, #1, mul vl\] +** ldr z2, \[sp, #2, mul vl\] +** ldr z3, \[sp, #3, mul vl\] +** ldr z4, \[sp, #4, mul vl\] +** ldr z5, \[sp, #5, mul vl\] +** ldr z6, \[sp, #6, mul vl\] +** ldr z7, \[sp, #7, mul vl\] +** addvl sp, sp, #8 +** ... +*/ +void consume_z7 (svint8x4_t z0, svint8x4_t z4); + +__arm_locally_streaming void +test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2) +{ + asm (""); + consume_z7 (*ptr1, *ptr2); + asm (""); +} + +/* +** test_p3: +** ... +** addvl sp, sp, #-1 +** str p0, \[sp\] +** str p1, \[sp, #1, mul vl\] +** str p2, \[sp, #2, mul vl\] +** str p3, \[sp, #3, mul vl\] +** smstop sm +** ldr p0, \[sp\] +** ldr p1, \[sp, #1, mul vl\] +** ldr p2, \[sp, #2, mul vl\] +** ldr p3, \[sp, #3, mul vl\] +** addvl sp, sp, #1 +** ... +*/ +void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3); + +__arm_locally_streaming void +test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4) +{ + asm (""); + consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4); + asm (""); +} -- 2.25.1