This patch adds support for the __arm_locally_streaming attribute,
which allows a function to use streaming mode (SME) internally
without changing the function's ABI.  The attribute is valid but
redundant for __arm_streaming functions, whose bodies already
execute in streaming mode.
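
For illustration, a minimal sketch of the attribute in use (the
function names are invented for the example; the syntax follows the
tests added by this patch):

    /* n_example keeps the normal non-streaming ABI, so callers need
       no SME awareness, but its body executes with PSTATE.SM set.
       The prologue and epilogue emit the mode switches and preserve
       the registers that SMSTART SM and SMSTOP SM clobber.  */
    __arm_locally_streaming void
    n_example (void)
    {
      /* ...code that uses streaming instructions...  */
    }

    /* Valid but redundant: the function already executes in
       streaming mode throughout.  */
    __arm_locally_streaming void
    s_example (void) __arm_streaming
    {
    }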

gcc/
        * config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
        arm::locally_streaming.
        (aarch64_fndecl_is_locally_streaming): New function.
        (aarch64_fndecl_sm_state): Handle locally-streaming functions.
        (aarch64_cfun_enables_pstate_sm): New function.
        (aarch64_add_offset): Add an argument that specifies whether
        the streaming vector length should be used instead of the
        prevailing one.
        (aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
        (aarch64_allocate_and_probe_stack_space): Likewise.
        (aarch64_expand_mov_immediate): Update calls accordingly.
        (aarch64_need_old_pstate_sm): Return true for locally-streaming
        streaming-compatible functions.
        (aarch64_layout_frame): Force all call-preserved Z and P registers
        to be saved and restored if the function switches PSTATE.SM in the
        prologue.
        (aarch64_get_separate_components): Disable shrink-wrapping of
        such Z and P saves and restores.
        (aarch64_use_late_prologue_epilogue): New function.
        (aarch64_expand_prologue): Measure SVE lengths in the streaming
        vector length for locally-streaming functions, then emit code
        to enable streaming mode.
        (aarch64_expand_epilogue): Likewise in reverse.
        (TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
        * config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
        Define __arm_locally_streaming.

gcc/testsuite/
        * gcc.target/aarch64/sme/locally_streaming_1.c: New test.
        * gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
        * gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
        * gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
        * gcc.target/aarch64/sme/keyword_macros_1.c: Add
        __arm_locally_streaming.
        * g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
---
 gcc/config/aarch64/aarch64-c.cc               |   1 +
 gcc/config/aarch64/aarch64.cc                 | 233 +++++++--
 .../g++.target/aarch64/sme/keyword_macros_1.C |   1 +
 .../gcc.target/aarch64/sme/keyword_macros_1.c |   1 +
 .../aarch64/sme/locally_streaming_1.c         | 466 ++++++++++++++++++
 .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
 .../aarch64/sme/locally_streaming_3.c         | 273 ++++++++++
 .../aarch64/sme/locally_streaming_4.c         | 145 ++++++
 8 files changed, 1259 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index f2fa5df1b82..2a8ca46987a 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
 
   DEFINE_ARM_KEYWORD_MACRO ("streaming");
   DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
+  DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");
 
 #undef DEFINE_ARM_KEYWORD_MACRO
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 12753ac133e..6ad29a3a84f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3136,6 +3136,7 @@ static const attribute_spec aarch64_arm_attributes[] =
                          NULL, attr_streaming_exclusions },
   { "streaming_compatible", 0, 0, false, true,  true,  true,
                          NULL, attr_streaming_exclusions },
+  { "locally_streaming",  0, 0, true, false, false, false, NULL, NULL },
   { "new",               1, -1, true, false, false, false,
                          handle_arm_new, NULL },
   { "preserves",         1, -1, false, true,  true,  true,
@@ -4445,6 +4446,16 @@ aarch64_fntype_isa_mode (const_tree fntype)
          | aarch64_fntype_pstate_za (fntype));
 }
 
+/* Return true if FNDECL uses streaming mode internally, as an
+   implementation choice.  */
+
+static bool
+aarch64_fndecl_is_locally_streaming (const_tree fndecl)
+{
+  return lookup_attribute ("arm", "locally_streaming",
+                          DECL_ATTRIBUTES (fndecl));
+}
+
 /* Return the state of PSTATE.SM when compiling the body of
    function FNDECL.  This might be different from the state of
    PSTATE.SM on entry.  */
@@ -4452,6 +4463,9 @@ aarch64_fntype_isa_mode (const_tree fntype)
 static aarch64_feature_flags
 aarch64_fndecl_pstate_sm (const_tree fndecl)
 {
+  if (aarch64_fndecl_is_locally_streaming (fndecl))
+    return AARCH64_FL_SM_ON;
+
   return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
 }
 
@@ -4527,6 +4541,16 @@ aarch64_cfun_has_new_state (const char *state_name)
   return aarch64_fndecl_has_new_state (cfun->decl, state_name);
 }
 
+/* Return true if PSTATE.SM is 1 in the body of the current function,
+   but is not guaranteed to be 1 on entry.  */
+
+static bool
+aarch64_cfun_enables_pstate_sm ()
+{
+  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
+         && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
+}
+
 /* Return true if the current function has state STATE_NAME, either by
    creating new state itself or by sharing state with callers.  */
 
@@ -6768,6 +6792,10 @@ aarch64_add_offset_temporaries (rtx x)
    TEMP2, if nonnull, is a second temporary register that doesn't
    overlap either DEST or REG.
 
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.
+
    Since this function may be used to adjust the stack pointer, we must
    ensure that it cannot cause transient stack deallocation (for example
    by first incrementing SP and then decrementing when adjusting by a
@@ -6776,6 +6804,7 @@ aarch64_add_offset_temporaries (rtx x)
 static void
 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                    poly_int64 offset, rtx temp1, rtx temp2,
+                   aarch64_feature_flags force_isa_mode,
                    bool frame_related_p, bool emit_move_imm = true)
 {
   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
@@ -6788,9 +6817,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Try using ADDVL or ADDPL to add the whole value.  */
   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
     {
-      rtx offset_rtx = gen_int_mode (offset, mode);
+      gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+       offset_rtx = gen_int_mode (offset, mode);
+      else
+       offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
+      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
+       add_reg_note (insn, REG_CFA_ADJUST_CFA,
+                     gen_rtx_SET (dest, plus_constant (Pmode, src,
+                                                       offset)));
       return;
     }
 
@@ -6806,11 +6844,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   if (src != const0_rtx
       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
     {
-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+       offset_rtx = gen_int_mode (poly_offset, mode);
+      else
+       offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
       if (frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
          RTX_FRAME_RELATED_P (insn) = true;
+         if (force_isa_mode & AARCH64_FL_SM_ON)
+           add_reg_note (insn, REG_CFA_ADJUST_CFA,
+                         gen_rtx_SET (dest, plus_constant (Pmode, src,
+                                                           poly_offset)));
          src = dest;
        }
       else
@@ -6841,9 +6887,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
       rtx val;
       if (IN_RANGE (rel_factor, -32, 31))
        {
+         if (force_isa_mode & AARCH64_FL_SM_ON)
+           {
+             /* Try to use an unshifted RDSVL, otherwise fall back on
+                a shifted RDSVL #1.  */
+             if (aarch64_sve_rdvl_addvl_factor_p (factor))
+               shift = 0;
+             else
+               factor = rel_factor * 16;
+             val = aarch64_sme_vq_immediate (mode, factor, 0);
+           }
          /* Try to use an unshifted CNT[BHWD] or RDVL.  */
-         if (aarch64_sve_cnt_factor_p (factor)
-             || aarch64_sve_rdvl_addvl_factor_p (factor))
+         else if (aarch64_sve_cnt_factor_p (factor)
+                  || aarch64_sve_rdvl_addvl_factor_p (factor))
            {
              val = gen_int_mode (poly_int64 (factor, factor), mode);
              shift = 0;
@@ -6873,11 +6929,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
             a shift and add sequence for the multiplication.
             If CNTB << SHIFT is out of range, stick with the current
             shift factor.  */
-         if (IN_RANGE (low_bit, 2, 16 * 16))
+         if (force_isa_mode == 0
+             && IN_RANGE (low_bit, 2, 16 * 16))
            {
              val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
              shift = 0;
            }
+         else if ((force_isa_mode & AARCH64_FL_SM_ON)
+                  && aarch64_sve_rdvl_addvl_factor_p (low_bit))
+           {
+             val = aarch64_sme_vq_immediate (mode, low_bit, 0);
+             shift = 0;
+           }
          else
            val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
@@ -6965,30 +7028,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
 {
   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
-                     temp1, temp2, false);
+                     temp1, temp2, 0, false);
 }
 
 /* Add DELTA to the stack pointer, marking the instructions frame-related.
-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
-   if TEMP1 already contains abs (DELTA).  */
+   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
+   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
+   contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
+               aarch64_feature_flags force_isa_mode, bool emit_move_imm)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
-                     temp1, temp2, true, emit_move_imm);
+                     temp1, temp2, force_isa_mode, true, emit_move_imm);
 }
 
 /* Subtract DELTA from the stack pointer, marking the instructions
-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
-   if nonnull.  */
+   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
+   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
 
 static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
-               bool emit_move_imm = true)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
+               aarch64_feature_flags force_isa_mode,
+               bool frame_related_p, bool emit_move_imm = true)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
-                     temp1, temp2, frame_related_p, emit_move_imm);
+                     temp1, temp2, force_isa_mode, frame_related_p,
+                     emit_move_imm);
 }
 
 /* A streaming-compatible function needs to switch temporarily to the known
@@ -8014,11 +8081,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
-                                     NULL_RTX, NULL_RTX, false);
+                                     NULL_RTX, NULL_RTX, 0, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
-                                   dest, NULL_RTX, false);
+                                   dest, NULL_RTX, 0, false);
            }
          return;
        }
@@ -8045,7 +8112,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
-                                 NULL_RTX, NULL_RTX, false);
+                                 NULL_RTX, NULL_RTX, 0, false);
              return;
            }
 
@@ -8085,7 +8152,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
              gcc_assert(can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
-                                 NULL_RTX, NULL_RTX, false);
+                                 NULL_RTX, NULL_RTX, 0, false);
              return;
            }
          /* FALLTHRU */
@@ -9728,6 +9795,9 @@ aarch64_need_old_pstate_sm ()
   if (aarch64_cfun_incoming_pstate_sm () != 0)
     return false;
 
+  if (aarch64_cfun_enables_pstate_sm ())
+    return true;
+
   if (cfun->machine->call_switches_pstate_sm)
     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
@@ -9754,6 +9824,7 @@ aarch64_layout_frame (void)
   bool frame_related_fp_reg_p = false;
   aarch64_frame &frame = cfun->machine->frame;
   poly_int64 top_of_locals = -1;
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
 
   vec_safe_truncate (frame.saved_gprs, 0);
   vec_safe_truncate (frame.saved_fprs, 0);
@@ -9791,7 +9862,7 @@ aarch64_layout_frame (void)
       frame.reg_offset[regno] = SLOT_REQUIRED;
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
        && !fixed_regs[regno]
        && !crtl->abi->clobbers_full_reg_p (regno))
       {
@@ -9820,7 +9891,7 @@ aarch64_layout_frame (void)
     }
 
   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
        && !fixed_regs[regno]
        && !crtl->abi->clobbers_full_reg_p (regno))
       frame.reg_offset[regno] = SLOT_REQUIRED;
@@ -9937,7 +10008,8 @@ aarch64_layout_frame (void)
   /* If the current function changes the SVE vector length, ensure that the
      old value of the DWARF VG register is saved and available in the CFI,
      so that outer frames with VL-sized offsets can be processed correctly.  */
-  if (cfun->machine->call_switches_pstate_sm)
+  if (cfun->machine->call_switches_pstate_sm
+      || aarch64_cfun_enables_pstate_sm ())
     {
       frame.reg_offset[VG_REGNUM] = offset;
       offset += UNITS_PER_WORD;
@@ -10776,9 +10848,16 @@ aarch64_get_separate_components (void)
   bitmap_clear (components);
 
   /* The registers we need saved to the frame.  */
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
     if (aarch64_register_saved_on_entry (regno))
       {
+       /* Disallow shrink wrapping for registers that will be clobbered
+          by an SMSTART SM in the prologue.  */
+       if (enables_pstate_sm
+           && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
+         continue;
+
        /* Punt on saves and restores that use ST1D and LD1D.  We could
           try to be smarter, but it would involve making sure that the
           spare predicate register itself is safe to use at the save
@@ -11097,11 +11176,16 @@ aarch64_emit_stack_tie (rtx reg)
    events, e.g. if we were to allow the stack to be dropped by more than a page
    and then have multiple probes up and we take a signal somewhere in between
    then the signal handler doesn't know the state of the stack and can make no
-   assumptions about which pages have been probed.  */
+   assumptions about which pages have been probed.
+
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.  */
 
 static void
 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
                                        poly_int64 poly_size,
+                                       aarch64_feature_flags force_isa_mode,
                                        bool frame_related_p,
                                        bool final_adjustment_p)
 {
@@ -11143,7 +11227,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
   if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
+                     frame_related_p);
       return;
     }
 
@@ -11160,7 +11245,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
       /* First calculate the amount of bytes we're actually spilling.  */
       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
-                         poly_size, temp1, temp2, false, true);
+                         poly_size, temp1, temp2, force_isa_mode,
+                         false, true);
 
       rtx_insn *insn = get_last_insn ();
 
@@ -11218,7 +11304,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
        {
-         aarch64_sub_sp (NULL, temp2, guard_size, true);
+         aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
                                           guard_used_by_caller));
          emit_insn (gen_blockage ());
@@ -11229,7 +11315,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
-                         temp1, NULL, false, true);
+                         temp1, NULL, force_isa_mode, false, true);
       rtx_insn *insn = get_last_insn ();
 
       /* For the initial allocation, we don't have a frame pointer
@@ -11295,7 +11381,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       if (final_adjustment_p && rounded_size != 0)
        min_probe_threshold = 0;
 
-      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
       if (residual >= min_probe_threshold)
        {
          if (dump_file)
@@ -11360,6 +11446,14 @@ aarch64_epilogue_uses (int regno)
   return 0;
 }
 
+/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
+
+static bool
+aarch64_use_late_prologue_epilogue ()
+{
+  return aarch64_cfun_enables_pstate_sm ();
+}
+
 /* The current function's frame has a save slot for the incoming state
    of SVCR.  Return a legitimate memory for the slot, based on the hard
    frame pointer.  */
@@ -11496,6 +11590,9 @@ aarch64_expand_prologue (void)
   unsigned reg2 = frame.wb_push_candidate2;
   bool emit_frame_chain = frame.emit_frame_chain;
   rtx_insn *insn;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
     {
@@ -11557,7 +11654,7 @@ aarch64_expand_prologue (void)
      less the amount of the guard reserved for use by the caller's
      outgoing args.  */
   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
-                                         true, false);
+                                         force_isa_mode, true, false);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -11580,7 +11677,8 @@ aarch64_expand_prologue (void)
        gcc_assert (known_eq (chain_offset, 0));
       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                          stack_pointer_rtx, chain_offset,
-                         tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+                         tmp1_rtx, tmp0_rtx, force_isa_mode,
+                         frame_pointer_needed);
       if (frame_pointer_needed && !frame_size.is_constant ())
        {
          /* Variable-sized frames need to describe the save slot
@@ -11627,6 +11725,7 @@ aarch64_expand_prologue (void)
                  || known_eq (initial_adjust, 0));
       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
                                              sve_callee_adjust,
+                                             force_isa_mode,
                                              !frame_pointer_needed, false);
       bytes_below_sp -= sve_callee_adjust;
     }
@@ -11639,12 +11738,15 @@ aarch64_expand_prologue (void)
      that is assumed by the called.  */
   gcc_assert (known_eq (bytes_below_sp, final_adjust));
   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+                                         force_isa_mode,
                                          !frame_pointer_needed, true);
   if (emit_frame_chain && maybe_ne (final_adjust, 0))
     aarch64_emit_stack_tie (hard_frame_pointer_rtx);
 
-  /* Save the incoming value of PSTATE.SM, if required.  */
-  if (known_ge (frame.old_svcr_offset, 0))
+  /* Save the incoming value of PSTATE.SM, if required.  Code further
+     down does this for locally-streaming functions.  */
+  if (known_ge (frame.old_svcr_offset, 0)
+      && !aarch64_cfun_enables_pstate_sm ())
     {
       rtx mem = aarch64_old_svcr_mem ();
       MEM_VOLATILE_P (mem) = 1;
@@ -11676,6 +11778,34 @@ aarch64_expand_prologue (void)
            emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
        }
     }
+
+  /* Enable PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+       {
+         /* The current function is streaming-compatible.  Save the
+            original state of PSTATE.SM.  */
+         rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
+         emit_insn (gen_aarch64_read_svcr (svcr));
+         emit_move_insn (aarch64_old_svcr_mem (), svcr);
+         guard_label = aarch64_guard_switch_pstate_sm (svcr,
+                                                       aarch64_isa_flags);
+       }
+      aarch64_sme_mode_switch_regs args_switch;
+      auto &args = crtl->args.info;
+      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
+       {
+         rtx x = args.sme_mode_switch_args[i];
+         args_switch.add_reg (GET_MODE (x), REGNO (x));
+       }
+      args_switch.emit_prologue ();
+      emit_insn (gen_aarch64_smstart_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+       emit_label (guard_label);
+    }
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -11722,6 +11852,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
   HOST_WIDE_INT guard_size
     = 1 << param_stack_clash_protection_guard_size;
   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   /* We can re-use the registers when:
 
@@ -11746,6 +11879,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
     = maybe_ne (get_frame_size ()
                + frame.saved_varargs_size, 0);
 
+  /* Reset PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+       guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+                                                     aarch64_isa_flags);
+      aarch64_sme_mode_switch_regs return_switch;
+      if (crtl->return_rtx && REG_P (crtl->return_rtx))
+       return_switch.add_reg (GET_MODE (crtl->return_rtx),
+                              REGNO (crtl->return_rtx));
+      return_switch.emit_prologue ();
+      emit_insn (gen_aarch64_smstop_sm ());
+      return_switch.emit_epilogue ();
+      if (guard_label)
+       emit_label (guard_label);
+    }
+
   /* Emit a barrier to prevent loads from a deallocated stack.  */
   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
       || cfun->calls_alloca
@@ -11766,19 +11917,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
     aarch64_add_offset (Pmode, stack_pointer_rtx,
                        hard_frame_pointer_rtx,
                        -bytes_below_hard_fp + final_adjust,
-                       tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+                       tmp1_rtx, tmp0_rtx, force_isa_mode,
+                       callee_adjust == 0);
   else
      /* The case where we need to re-use the register here is very rare, so
        avoid the complicated condition and just always emit a move if the
        immediate doesn't fit.  */
-     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
 
   /* Restore the vector registers before the predicate registers,
      so that we can use P4 as a temporary for big-endian SVE frames.  */
   aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
   aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
   if (maybe_ne (sve_callee_adjust, 0))
-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
+                   force_isa_mode, true);
 
   /* When shadow call stack is enabled, the scs_pop in the epilogue will
      restore x30, we don't need to restore x30 again in the traditional
@@ -11808,7 +11961,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
 
   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
      add restriction on emit_move optimization to leaf functions.  */
-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
                  (!can_inherit_p || !crtl->is_leaf
                   || df_regs_ever_live_p (EP0_REGNUM)));
 
@@ -11941,7 +12094,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   if (vcall_offset == 0)
-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
+                       0, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
@@ -11954,7 +12108,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
                                       plus_constant (Pmode, this_rtx, delta));
          else
            aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
-                               temp1, temp0, false);
+                               temp1, temp0, 0, false);
        }
 
       if (Pmode == ptr_mode)
@@ -31355,6 +31509,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_EXTRA_LIVE_ON_ENTRY
 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
 
+#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
+#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
+
 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
 
diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
index 8b0755014cc..dc5c097bd52 100644
--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
index fcabe3edc55..22f5facfdf9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
new file mode 100644
index 00000000000..20ff4b87d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
@@ -0,0 +1,466 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void consume_za () [[arm::streaming, arm::inout("za")]];
+
+/*
+** n_ls:
+**     sub     sp, sp, #?80
+**     cntd    x16
+**     str     x16, \[sp\]
+**     stp     d8, d9, \[sp, #?16\]
+**     stp     d10, d11, \[sp, #?32\]
+**     stp     d12, d13, \[sp, #?48\]
+**     stp     d14, d15, \[sp, #?64\]
+**     smstart sm
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?16\]
+**     ldp     d10, d11, \[sp, #?32\]
+**     ldp     d12, d13, \[sp, #?48\]
+**     ldp     d14, d15, \[sp, #?64\]
+**     add     sp, sp, #?80
+**     ret
+*/
+[[arm::locally_streaming]] void
+n_ls ()
+{
+  asm ("");
+}
+
+/*
+** s_ls:
+**     ret
+*/
+[[arm::locally_streaming]] void
+s_ls () [[arm::streaming]]
+{
+  asm ("");
+}
+
+/*
+** sc_ls:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     cntd    x16
+**     str     x16, \[sp, #?24\]
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstart sm
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+[[arm::locally_streaming]] void
+sc_ls () [[arm::streaming_compatible]]
+{
+  asm ("");
+}
+
+/*
+** n_ls_new_za:
+**     str     x30, \[sp, #?-80\]!
+**     cntd    x16
+**     str     x16, \[sp, #?8\]
+**     stp     d8, d9, \[sp, #?16\]
+**     stp     d10, d11, \[sp, #?32\]
+**     stp     d12, d13, \[sp, #?48\]
+**     stp     d14, d15, \[sp, #?64\]
+**     smstart sm
+**     mrs     (x[0-9]+), tpidr2_el0
+**     cbz     \1, [^\n]+
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     zero    { za }
+**     smstart za
+**     bl      consume_za
+**     smstop  za
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?16\]
+**     ldp     d10, d11, \[sp, #?32\]
+**     ldp     d12, d13, \[sp, #?48\]
+**     ldp     d14, d15, \[sp, #?64\]
+**     ldr     x30, \[sp\], #?80
+**     ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+n_ls_new_za ()
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** s_ls_new_za:
+**     str     x30, \[sp, #?-16\]!
+**     mrs     (x[0-9]+), tpidr2_el0
+**     cbz     \1, [^\n]+
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     zero    { za }
+**     smstart za
+**     bl      consume_za
+**     smstop  za
+**     ldr     x30, \[sp\], #?16
+**     ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+s_ls_new_za () [[arm::streaming]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** sc_ls_new_za:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     cntd    x16
+**     str     x16, \[sp, #?24\]
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstart sm
+**     mrs     (x[0-9]+), tpidr2_el0
+**     cbz     \1, [^\n]+
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     zero    { za }
+**     smstart za
+**     bl      consume_za
+**     smstop  za
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+sc_ls_new_za () [[arm::streaming_compatible]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** n_ls_shared_za:
+**     str     x30, \[sp, #?-80\]!
+**     cntd    x16
+**     str     x16, \[sp, #?8\]
+**     stp     d8, d9, \[sp, #?16\]
+**     stp     d10, d11, \[sp, #?32\]
+**     stp     d12, d13, \[sp, #?48\]
+**     stp     d14, d15, \[sp, #?64\]
+**     smstart sm
+**     bl      consume_za
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?16\]
+**     ldp     d10, d11, \[sp, #?32\]
+**     ldp     d12, d13, \[sp, #?48\]
+**     ldp     d14, d15, \[sp, #?64\]
+**     ldr     x30, \[sp\], #?80
+**     ret
+*/
+[[arm::locally_streaming]] void
+n_ls_shared_za () [[arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** s_ls_shared_za:
+**     str     x30, \[sp, #?-16\]!
+**     bl      consume_za
+**     ldr     x30, \[sp\], #?16
+**     ret
+*/
+[[arm::locally_streaming]] void
+s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** sc_ls_shared_za:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     cntd    x16
+**     str     x16, \[sp, #?24\]
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstart sm
+**     bl      consume_za
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, [^\n]+
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+[[arm::locally_streaming]] void
+sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** n_ls_vector_pcs:
+**     sub     sp, sp, #?272
+**     cntd    x16
+**     str     x16, \[sp\]
+**     stp     q8, q9, \[sp, #?16\]
+**     stp     q10, q11, \[sp, #?48\]
+**     stp     q12, q13, \[sp, #?80\]
+**     stp     q14, q15, \[sp, #?112\]
+**     stp     q16, q17, \[sp, #?144\]
+**     stp     q18, q19, \[sp, #?176\]
+**     stp     q20, q21, \[sp, #?208\]
+**     stp     q22, q23, \[sp, #?240\]
+**     smstart sm
+**     smstop  sm
+**     ldp     q8, q9, \[sp, #?16\]
+**     ldp     q10, q11, \[sp, #?48\]
+**     ldp     q12, q13, \[sp, #?80\]
+**     ldp     q14, q15, \[sp, #?112\]
+**     ldp     q16, q17, \[sp, #?144\]
+**     ldp     q18, q19, \[sp, #?176\]
+**     ldp     q20, q21, \[sp, #?208\]
+**     ldp     q22, q23, \[sp, #?240\]
+**     add     sp, sp, #?272
+**     ret
+*/
+[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
+n_ls_vector_pcs ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_sve_pcs:
+**     sub     sp, sp, #?16
+**     cntd    x16
+**     str     x16, \[sp\]
+**     addsvl  sp, sp, #-18
+**     str     p4, \[sp\]
+**     str     p5, \[sp, #1, mul vl\]
+**     str     p6, \[sp, #2, mul vl\]
+**     str     p7, \[sp, #3, mul vl\]
+**     str     p8, \[sp, #4, mul vl\]
+**     str     p9, \[sp, #5, mul vl\]
+**     str     p10, \[sp, #6, mul vl\]
+**     str     p11, \[sp, #7, mul vl\]
+**     str     p12, \[sp, #8, mul vl\]
+**     str     p13, \[sp, #9, mul vl\]
+**     str     p14, \[sp, #10, mul vl\]
+**     str     p15, \[sp, #11, mul vl\]
+**     str     z8, \[sp, #2, mul vl\]
+**     str     z9, \[sp, #3, mul vl\]
+**     str     z10, \[sp, #4, mul vl\]
+**     str     z11, \[sp, #5, mul vl\]
+**     str     z12, \[sp, #6, mul vl\]
+**     str     z13, \[sp, #7, mul vl\]
+**     str     z14, \[sp, #8, mul vl\]
+**     str     z15, \[sp, #9, mul vl\]
+**     str     z16, \[sp, #10, mul vl\]
+**     str     z17, \[sp, #11, mul vl\]
+**     str     z18, \[sp, #12, mul vl\]
+**     str     z19, \[sp, #13, mul vl\]
+**     str     z20, \[sp, #14, mul vl\]
+**     str     z21, \[sp, #15, mul vl\]
+**     str     z22, \[sp, #16, mul vl\]
+**     str     z23, \[sp, #17, mul vl\]
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ldr     z8, \[sp, #2, mul vl\]
+**     ldr     z9, \[sp, #3, mul vl\]
+**     ldr     z10, \[sp, #4, mul vl\]
+**     ldr     z11, \[sp, #5, mul vl\]
+**     ldr     z12, \[sp, #6, mul vl\]
+**     ldr     z13, \[sp, #7, mul vl\]
+**     ldr     z14, \[sp, #8, mul vl\]
+**     ldr     z15, \[sp, #9, mul vl\]
+**     ldr     z16, \[sp, #10, mul vl\]
+**     ldr     z17, \[sp, #11, mul vl\]
+**     ldr     z18, \[sp, #12, mul vl\]
+**     ldr     z19, \[sp, #13, mul vl\]
+**     ldr     z20, \[sp, #14, mul vl\]
+**     ldr     z21, \[sp, #15, mul vl\]
+**     ldr     z22, \[sp, #16, mul vl\]
+**     ldr     z23, \[sp, #17, mul vl\]
+**     ldr     p4, \[sp\]
+**     ldr     p5, \[sp, #1, mul vl\]
+**     ldr     p6, \[sp, #2, mul vl\]
+**     ldr     p7, \[sp, #3, mul vl\]
+**     ldr     p8, \[sp, #4, mul vl\]
+**     ldr     p9, \[sp, #5, mul vl\]
+**     ldr     p10, \[sp, #6, mul vl\]
+**     ldr     p11, \[sp, #7, mul vl\]
+**     ldr     p12, \[sp, #8, mul vl\]
+**     ldr     p13, \[sp, #9, mul vl\]
+**     ldr     p14, \[sp, #10, mul vl\]
+**     ldr     p15, \[sp, #11, mul vl\]
+**     addsvl  sp, sp, #18
+**     add     sp, sp, #?16
+**     ret
+*/
+[[arm::locally_streaming]] void
+n_ls_sve_pcs (__SVBool_t x)
+{
+  asm ("");
+}
+
+/*
+** n_ls_v0:
+**     addsvl  sp, sp, #-1
+**     ...
+**     smstart sm
+**     add     x[0-9]+, [^\n]+
+**     smstop  sm
+**     ...
+**     addsvl  sp, sp, #1
+**     ...
+*/
+#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
+[[arm::locally_streaming]] void
+n_ls_v0 ()
+{
+  TEST (v0);
+}
+
+/*
+** n_ls_v32:
+**     addsvl  sp, sp, #-32
+**     ...
+**     smstart sm
+**     ...
+**     smstop  sm
+**     ...
+**     rdsvl   (x[0-9]+), #1
+**     lsl     (x[0-9]+), \1, #?5
+**     add     sp, sp, \2
+**     ...
+*/
+[[arm::locally_streaming]] void
+n_ls_v32 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+}
+
+/*
+** n_ls_v33:
+**     rdsvl   (x[0-9]+), #1
+**     mov     (x[0-9]+), #?33
+**     mul     (x[0-9]+), (?:\1, \2|\2, \1)
+**     sub     sp, sp, \3
+**     ...
+**     smstart sm
+**     ...
+**     smstop  sm
+**     ...
+**     rdsvl   (x[0-9]+), #1
+**     mov     (x[0-9]+), #?33
+**     mul     (x[0-9]+), (?:\4, \5|\5, \4)
+**     add     sp, sp, \6
+**     ...
+*/
+[[arm::locally_streaming]] void
+n_ls_v33 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+  TEST (v32);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
new file mode 100644
index 00000000000..0eba993855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
@@ -0,0 +1,177 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**     ...
+**     smstart sm
+**     ...
+**     fmov    x10, d0
+**     smstop  sm
+**     fmov    d0, x10
+**     ...
+*/
+[[arm::locally_streaming]] double
+test_d0 ()
+{
+  asm ("");
+  return 1.0f;
+}
+
+/*
+** test_d0_vec:
+**     ...
+**     smstart sm
+**     ...
+** (
+**     fmov    x10, d0
+** |
+**     umov    x10, v0.d\[0\]
+** )
+**     smstop  sm
+**     fmov    d0, x10
+**     ...
+*/
+[[arm::locally_streaming]] int8x8_t
+test_d0_vec ()
+{
+  asm ("");
+  return (int8x8_t) {};
+}
+
+/*
+** test_q0:
+**     ...
+**     smstart sm
+**     ...
+**     str     q0, \[sp, #?-16\]!
+**     smstop  sm
+**     ldr     q0, \[sp\], #?16
+**     ...
+*/
+[[arm::locally_streaming]] int8x16_t
+test_q0 ()
+{
+  asm ("");
+  return (int8x16_t) {};
+}
+
+/*
+** test_q1:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-32\]!
+**     smstop  sm
+**     ldp     q0, q1, \[sp\], #?32
+**     ...
+*/
+[[arm::locally_streaming]] int8x16x2_t
+test_q1 ()
+{
+  asm ("");
+  return (int8x16x2_t) {};
+}
+
+/*
+** test_q2:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-48\]!
+**     str     q2, \[sp, #?32\]
+**     smstop  sm
+**     ldr     q2, \[sp, #?32\]
+**     ldp     q0, q1, \[sp\], #?48
+**     ...
+*/
+[[arm::locally_streaming]] int8x16x3_t
+test_q2 ()
+{
+  asm ("");
+  return (int8x16x3_t) {};
+}
+
+/*
+** test_q3:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-64\]!
+**     stp     q2, q3, \[sp, #?32\]
+**     smstop  sm
+**     ldp     q2, q3, \[sp, #?32\]
+**     ldp     q0, q1, \[sp\], #?64
+**     ...
+*/
+[[arm::locally_streaming]] int8x16x4_t
+test_q3 ()
+{
+  asm ("");
+  return (int8x16x4_t) {};
+}
+
+/*
+** test_z0:
+**     ...
+**     smstart sm
+**     mov     z0\.b, #0
+**     addvl   sp, sp, #-1
+**     str     z0, \[sp\]
+**     smstop  sm
+**     ldr     z0, \[sp\]
+**     addvl   sp, sp, #1
+**     ...
+*/
+[[arm::locally_streaming]] svint8_t
+test_z0 ()
+{
+  asm ("");
+  return (svint8_t) {};
+}
+
+/*
+** test_z3:
+**     ...
+**     smstart sm
+**     ...
+**     addvl   sp, sp, #-4
+**     str     z0, \[sp\]
+**     str     z1, \[sp, #1, mul vl\]
+**     str     z2, \[sp, #2, mul vl\]
+**     str     z3, \[sp, #3, mul vl\]
+**     smstop  sm
+**     ldr     z0, \[sp\]
+**     ldr     z1, \[sp, #1, mul vl\]
+**     ldr     z2, \[sp, #2, mul vl\]
+**     ldr     z3, \[sp, #3, mul vl\]
+**     ...
+*/
+[[arm::locally_streaming]] svint8x4_t
+test_z3 ()
+{
+  asm ("");
+  return (svint8x4_t) {};
+}
+
+/*
+** test_p0:
+**     ...
+**     smstart sm
+**     pfalse  p0\.b
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstop  sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     ...
+*/
+[[arm::locally_streaming]] svbool_t
+test_p0 ()
+{
+  asm ("");
+  return (svbool_t) {};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
new file mode 100644
index 00000000000..2bdea6ac631
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
@@ -0,0 +1,273 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**     ...
+**     fmov    x10, d0
+**     smstart sm
+**     fmov    d0, x10
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_d0 (double d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7:
+**     ...
+**     fmov    x10, d0
+**     fmov    x11, d1
+**     fmov    x12, d2
+**     fmov    x13, d3
+**     fmov    x14, d4
+**     fmov    x15, d5
+**     fmov    x16, d6
+**     fmov    x17, d7
+**     smstart sm
+**     fmov    d0, x10
+**     fmov    d1, x11
+**     fmov    d2, x12
+**     fmov    d3, x13
+**     fmov    d4, x14
+**     fmov    d5, x15
+**     fmov    d6, x16
+**     fmov    d7, x17
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_d7 (double d0, double d1, double d2, double d3,
+        double d4, double d5, double d6, double d7)
+{
+  asm ("");
+}
+
+/*
+** test_d0_vec:
+**     ...
+** (
+**     fmov    x10, d0
+** |
+**     umov    x10, v0.d\[0\]
+** )
+**     smstart sm
+**     fmov    d0, x10
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_d0_vec (int8x8_t d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7_vec:
+**     ...
+** (
+**     fmov    x10, d0
+**     fmov    x11, d1
+**     fmov    x12, d2
+**     fmov    x13, d3
+**     fmov    x14, d4
+**     fmov    x15, d5
+**     fmov    x16, d6
+**     fmov    x17, d7
+** |
+**     umov    x10, v0.d\[0\]
+**     umov    x11, v1.d\[0\]
+**     umov    x12, v2.d\[0\]
+**     umov    x13, v3.d\[0\]
+**     umov    x14, v4.d\[0\]
+**     umov    x15, v5.d\[0\]
+**     umov    x16, v6.d\[0\]
+**     umov    x17, v7.d\[0\]
+** )
+**     smstart sm
+**     fmov    d0, x10
+**     fmov    d1, x11
+**     fmov    d2, x12
+**     fmov    d3, x13
+**     fmov    d4, x14
+**     fmov    d5, x15
+**     fmov    d6, x16
+**     fmov    d7, x17
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
+            int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
+{
+  asm ("");
+}
+
+/*
+** test_q0:
+**     ...
+**     str     q0, \[sp, #?-16\]!
+**     smstart sm
+**     ldr     q0, \[sp\], #?16
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_q0 (int8x16_t q0)
+{
+  asm ("");
+}
+
+/*
+** test_q7:
+**     ...
+**     stp     q0, q1, \[sp, #?-128\]!
+**     stp     q2, q3, \[sp, #?32\]
+**     stp     q4, q5, \[sp, #?64\]
+**     stp     q6, q7, \[sp, #?96\]
+**     smstart sm
+**     ldp     q2, q3, \[sp, #?32\]
+**     ldp     q4, q5, \[sp, #?64\]
+**     ldp     q6, q7, \[sp, #?96\]
+**     ldp     q0, q1, \[sp\], #?128
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_q7 (int8x16x4_t q0, int8x16x4_t q4)
+{
+  asm ("");
+}
+
+/*
+** test_z0:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     z0, \[sp\]
+**     smstart sm
+**     ldr     z0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_z0 (svint8_t z0)
+{
+  asm ("");
+}
+
+/*
+** test_z7:
+**     ...
+**     addvl   sp, sp, #-8
+**     str     z0, \[sp\]
+**     str     z1, \[sp, #1, mul vl\]
+**     str     z2, \[sp, #2, mul vl\]
+**     str     z3, \[sp, #3, mul vl\]
+**     str     z4, \[sp, #4, mul vl\]
+**     str     z5, \[sp, #5, mul vl\]
+**     str     z6, \[sp, #6, mul vl\]
+**     str     z7, \[sp, #7, mul vl\]
+**     smstart sm
+**     ldr     z0, \[sp\]
+**     ldr     z1, \[sp, #1, mul vl\]
+**     ldr     z2, \[sp, #2, mul vl\]
+**     ldr     z3, \[sp, #3, mul vl\]
+**     ldr     z4, \[sp, #4, mul vl\]
+**     ldr     z5, \[sp, #5, mul vl\]
+**     ldr     z6, \[sp, #6, mul vl\]
+**     ldr     z7, \[sp, #7, mul vl\]
+**     addvl   sp, sp, #8
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_z7 (svint8x4_t z0, svint8x4_t z4)
+{
+  asm ("");
+}
+
+/*
+** test_p0:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_p0 (svbool_t p0)
+{
+  asm ("");
+}
+
+/*
+** test_p3:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     str     p1, \[sp, #1, mul vl\]
+**     str     p2, \[sp, #2, mul vl\]
+**     str     p3, \[sp, #3, mul vl\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     ldr     p1, \[sp, #1, mul vl\]
+**     ldr     p2, \[sp, #2, mul vl\]
+**     ldr     p3, \[sp, #3, mul vl\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm ("");
+}
+
+/*
+** test_mixed:
+**     ...
+**     addvl   sp, sp, #-3
+**     str     p0, \[sp\]
+**     str     p1, \[sp, #1, mul vl\]
+**     str     p2, \[sp, #2, mul vl\]
+**     str     p3, \[sp, #3, mul vl\]
+**     str     z3, \[sp, #1, mul vl\]
+**     str     z7, \[sp, #2, mul vl\]
+**     stp     q2, q6, \[sp, #?-32\]!
+**     fmov    w10, s0
+**     fmov    x11, d1
+**     fmov    w12, s4
+**     fmov    x13, d5
+**     smstart sm
+**     fmov    s0, w10
+**     fmov    d1, x11
+**     fmov    s4, w12
+**     fmov    d5, x13
+**     ldp     q2, q6, \[sp\], #?32
+**     ldr     p0, \[sp\]
+**     ldr     p1, \[sp, #1, mul vl\]
+**     ldr     p2, \[sp, #2, mul vl\]
+**     ldr     p3, \[sp, #3, mul vl\]
+**     ldr     z3, \[sp, #1, mul vl\]
+**     ldr     z7, \[sp, #2, mul vl\]
+**     addvl   sp, sp, #3
+**     smstop  sm
+**     ...
+*/
+[[arm::locally_streaming]] void
+test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
+           float s4, double d5, float64x2_t q6, svfloat64_t z7,
+           svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm ("");
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
new file mode 100644
index 00000000000..42adeb152e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
@@ -0,0 +1,145 @@
+// { dg-options "-O -fomit-frame-pointer" }
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**     ...
+**     smstart sm
+**     ...
+**     fmov    x10, d0
+**     smstop  sm
+**     fmov    d0, x10
+**     ...
+**     smstart sm
+**     ...
+**     smstop  sm
+**     ...
+*/
+void consume_d0 (double d0);
+
+__arm_locally_streaming void
+test_d0 ()
+{
+  asm ("");
+  consume_d0 (1.0);
+  asm ("");
+}
+
+/*
+** test_d7:
+**     ...
+**     fmov    x10, d0
+**     fmov    x11, d1
+**     fmov    x12, d2
+**     fmov    x13, d3
+**     fmov    x14, d4
+**     fmov    x15, d5
+**     fmov    x16, d6
+**     fmov    x17, d7
+**     smstop  sm
+**     fmov    d0, x10
+**     fmov    d1, x11
+**     fmov    d2, x12
+**     fmov    d3, x13
+**     fmov    d4, x14
+**     fmov    d5, x15
+**     fmov    d6, x16
+**     fmov    d7, x17
+**     ...
+*/
+void consume_d7 (double d0, double d1, double d2, double d3,
+                double d4, double d5, double d6, double d7);
+__arm_locally_streaming void
+test_d7 ()
+{
+  asm ("");
+  consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+  asm ("");
+}
+
+/*
+** test_q7:
+**     ...
+**     stp     q0, q1, \[sp, #?-128\]!
+**     stp     q2, q3, \[sp, #?32\]
+**     stp     q4, q5, \[sp, #?64\]
+**     stp     q6, q7, \[sp, #?96\]
+**     smstop  sm
+**     ldp     q2, q3, \[sp, #?32\]
+**     ldp     q4, q5, \[sp, #?64\]
+**     ldp     q6, q7, \[sp, #?96\]
+**     ldp     q0, q1, \[sp\], #?128
+**     ...
+*/
+void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
+
+__arm_locally_streaming void
+test_q7 (int8x16x4_t *ptr)
+{
+  asm ("");
+  consume_q7 (ptr[0], ptr[1]);
+  asm ("");
+}
+
+/*
+** test_z7:
+**     ...
+**     addvl   sp, sp, #-8
+**     str     z0, \[sp\]
+**     str     z1, \[sp, #1, mul vl\]
+**     str     z2, \[sp, #2, mul vl\]
+**     str     z3, \[sp, #3, mul vl\]
+**     str     z4, \[sp, #4, mul vl\]
+**     str     z5, \[sp, #5, mul vl\]
+**     str     z6, \[sp, #6, mul vl\]
+**     str     z7, \[sp, #7, mul vl\]
+**     smstop  sm
+**     ldr     z0, \[sp\]
+**     ldr     z1, \[sp, #1, mul vl\]
+**     ldr     z2, \[sp, #2, mul vl\]
+**     ldr     z3, \[sp, #3, mul vl\]
+**     ldr     z4, \[sp, #4, mul vl\]
+**     ldr     z5, \[sp, #5, mul vl\]
+**     ldr     z6, \[sp, #6, mul vl\]
+**     ldr     z7, \[sp, #7, mul vl\]
+**     addvl   sp, sp, #8
+**     ...
+*/
+void consume_z7 (svint8x4_t z0, svint8x4_t z4);
+
+__arm_locally_streaming void
+test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
+{
+  asm ("");
+  consume_z7 (*ptr1, *ptr2);
+  asm ("");
+}
+
+/*
+** test_p3:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     str     p1, \[sp, #1, mul vl\]
+**     str     p2, \[sp, #2, mul vl\]
+**     str     p3, \[sp, #3, mul vl\]
+**     smstop  sm
+**     ldr     p0, \[sp\]
+**     ldr     p1, \[sp, #1, mul vl\]
+**     ldr     p2, \[sp, #2, mul vl\]
+**     ldr     p3, \[sp, #3, mul vl\]
+**     addvl   sp, sp, #1
+**     ...
+*/
+void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
+
+__arm_locally_streaming void
+test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
+{
+  asm ("");
+  consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
+  asm ("");
+}
-- 
2.25.1

