This patch adds support for the arm_locally_streaming attribute,
which allows a function to use SME internally without changing
the function's ABI.  The attribute is valid but redundant for
arm_streaming functions.
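
As a short illustration (the function names below are only examples; the
attribute usage matches the new tests), a locally-streaming function keeps
its normal ABI while its body runs in streaming mode:

  /* The compiler switches PSTATE.SM on in the prologue and off in the
     epilogue, saving and restoring the affected call-preserved registers.  */
  void __attribute__((arm_locally_streaming))
  use_sme_internally (void)
  {
    /* This code executes with PSTATE.SM == 1.  */
  }

  /* Callers are unaffected: the function is called like any other
     non-streaming function.  */
  void
  caller (void)
  {
    use_sme_internally ();
  }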

gcc/
        * config/aarch64/aarch64.cc (aarch64_attribute_table): Add
        arm_locally_streaming.
        (aarch64_fndecl_is_locally_streaming): New function.
        (aarch64_fndecl_sm_state): Handle arm_locally_streaming functions.
        (aarch64_cfun_enables_pstate_sm): New function.
        (aarch64_add_offset): Add an argument that specifies whether
        the streaming vector length should be used instead of the
        prevailing one.
        (aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
        (aarch64_allocate_and_probe_stack_space): Likewise.
        (aarch64_expand_mov_immediate): Update calls accordingly.
        (aarch64_need_old_pstate_sm): Return true for locally-streaming
        streaming-compatible functions.
        (aarch64_layout_frame): Force all call-preserved Z and P registers
        to be saved and restored if the function switches PSTATE.SM in the
        prologue.
        (aarch64_get_separate_components): Disable shrink-wrapping of
        such Z and P saves and restores.
        (aarch64_use_late_prologue_epilogue): New function.
        (aarch64_expand_prologue): Measure SVE lengths in the streaming
        vector length for locally-streaming functions, then emit code
        to enable streaming mode.  Combine separate SMSTART ZA and
        SMSTART SM instructions into a single SMSTART where possible.
        (aarch64_expand_epilogue): Likewise in reverse.
        (TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
        * config/aarch64/aarch64-sme.md (UNSPEC_SMSTART): New unspec.
        (UNSPEC_SMSTOP): Likewise.
        (aarch64_smstart, aarch64_smstop): New patterns.

gcc/testsuite/
        * gcc.target/aarch64/sme/locally_streaming_1.c: New test.
        * gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
        * gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
---
 gcc/config/aarch64/aarch64-sme.md             |  82 ++++
 gcc/config/aarch64/aarch64.cc                 | 237 ++++++++--
 .../aarch64/sme/locally_streaming_1.c         | 433 ++++++++++++++++++
 .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
 .../aarch64/sme/locally_streaming_3.c         | 273 +++++++++++
 5 files changed, 1164 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c

diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 7b3ccea2e11..70be7adba28 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -281,6 +281,88 @@ (define_insn_and_split "aarch64_restore_za"
     DONE;
   }
 )
+
+;; -------------------------------------------------------------------------
+;; ---- Combined PSTATE.SM and PSTATE.ZA management
+;; -------------------------------------------------------------------------
+;; Includes
+;; - SMSTART
+;; - SMSTOP
+;; -------------------------------------------------------------------------
+
+(define_c_enum "unspec" [
+  UNSPEC_SMSTART
+  UNSPEC_SMSTOP
+])
+
+;; Enable SM and ZA, starting with fresh ZA contents.  This is only valid when
+;; SME is present, but the pattern does not depend on TARGET_SME since it can
+;; be used conditionally.
+(define_insn "aarch64_smstart"
+  [(unspec_volatile [(const_int 0)] UNSPEC_SMSTART)
+   (clobber (reg:V4x16QI V0_REGNUM))
+   (clobber (reg:V4x16QI V4_REGNUM))
+   (clobber (reg:V4x16QI V8_REGNUM))
+   (clobber (reg:V4x16QI V12_REGNUM))
+   (clobber (reg:V4x16QI V16_REGNUM))
+   (clobber (reg:V4x16QI V20_REGNUM))
+   (clobber (reg:V4x16QI V24_REGNUM))
+   (clobber (reg:V4x16QI V28_REGNUM))
+   (clobber (reg:VNx16BI P0_REGNUM))
+   (clobber (reg:VNx16BI P1_REGNUM))
+   (clobber (reg:VNx16BI P2_REGNUM))
+   (clobber (reg:VNx16BI P3_REGNUM))
+   (clobber (reg:VNx16BI P4_REGNUM))
+   (clobber (reg:VNx16BI P5_REGNUM))
+   (clobber (reg:VNx16BI P6_REGNUM))
+   (clobber (reg:VNx16BI P7_REGNUM))
+   (clobber (reg:VNx16BI P8_REGNUM))
+   (clobber (reg:VNx16BI P9_REGNUM))
+   (clobber (reg:VNx16BI P10_REGNUM))
+   (clobber (reg:VNx16BI P11_REGNUM))
+   (clobber (reg:VNx16BI P12_REGNUM))
+   (clobber (reg:VNx16BI P13_REGNUM))
+   (clobber (reg:VNx16BI P14_REGNUM))
+   (clobber (reg:VNx16BI P15_REGNUM))
+   (clobber (reg:VNx16QI ZA_REGNUM))]
+  ""
+  "smstart"
+)
+
+;; Disable SM and ZA, discarding the current ZA contents.  This is only
+;; valid when SME is present, but the pattern does not depend on TARGET_SME
+;; since it can be used conditionally.
+(define_insn "aarch64_smstop"
+  [(unspec_volatile [(reg:VNx16QI OLD_ZA_REGNUM)] UNSPEC_SMSTOP)
+   (clobber (reg:V4x16QI V0_REGNUM))
+   (clobber (reg:V4x16QI V4_REGNUM))
+   (clobber (reg:V4x16QI V8_REGNUM))
+   (clobber (reg:V4x16QI V12_REGNUM))
+   (clobber (reg:V4x16QI V16_REGNUM))
+   (clobber (reg:V4x16QI V20_REGNUM))
+   (clobber (reg:V4x16QI V24_REGNUM))
+   (clobber (reg:V4x16QI V28_REGNUM))
+   (clobber (reg:VNx16BI P0_REGNUM))
+   (clobber (reg:VNx16BI P1_REGNUM))
+   (clobber (reg:VNx16BI P2_REGNUM))
+   (clobber (reg:VNx16BI P3_REGNUM))
+   (clobber (reg:VNx16BI P4_REGNUM))
+   (clobber (reg:VNx16BI P5_REGNUM))
+   (clobber (reg:VNx16BI P6_REGNUM))
+   (clobber (reg:VNx16BI P7_REGNUM))
+   (clobber (reg:VNx16BI P8_REGNUM))
+   (clobber (reg:VNx16BI P9_REGNUM))
+   (clobber (reg:VNx16BI P10_REGNUM))
+   (clobber (reg:VNx16BI P11_REGNUM))
+   (clobber (reg:VNx16BI P12_REGNUM))
+   (clobber (reg:VNx16BI P13_REGNUM))
+   (clobber (reg:VNx16BI P14_REGNUM))
+   (clobber (reg:VNx16BI P15_REGNUM))
+   (clobber (reg:VNx16QI ZA_REGNUM))]
+  ""
+  "smstop"
+)
+
 ;; =========================================================================
 ;; == Loads, stores and moves
 ;; =========================================================================
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 966d13abe4c..48bf2de4b3d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2790,6 +2790,7 @@ static const struct attribute_spec aarch64_attribute_table[] =
                          NULL, attr_streaming_exclusions },
   { "arm_streaming_compatible", 0, 0, false, true,  true,  true,
                          NULL, attr_streaming_exclusions },
+  { "arm_locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
   { "arm_new_za",        0, 0, true, false, false, false,
                          handle_arm_new_za_attribute,
                          attr_arm_new_za_exclusions },
@@ -4162,6 +4163,15 @@ aarch64_fndecl_has_new_za_state (const_tree fndecl)
   return lookup_attribute ("arm_new_za", DECL_ATTRIBUTES (fndecl));
 }
 
+/* Return true if FNDECL uses streaming mode internally, as an
+   implementation choice.  */
+
+static bool
+aarch64_fndecl_is_locally_streaming (const_tree fndecl)
+{
+  return lookup_attribute ("arm_locally_streaming", DECL_ATTRIBUTES (fndecl));
+}
+
 /* Return the state of PSTATE.SM when compiling the body of
    function FNDECL.  This might be different from the state of
    PSTATE.SM on entry.  */
@@ -4169,6 +4179,9 @@ aarch64_fndecl_has_new_za_state (const_tree fndecl)
 static aarch64_feature_flags
 aarch64_fndecl_sm_state (const_tree fndecl)
 {
+  if (aarch64_fndecl_is_locally_streaming (fndecl))
+    return AARCH64_FL_SM_ON;
+
   return aarch64_fntype_sm_state (TREE_TYPE (fndecl));
 }
 
@@ -4222,6 +4235,16 @@ aarch64_cfun_incoming_za_state ()
   return aarch64_fntype_za_state (TREE_TYPE (cfun->decl));
 }
 
+/* Return true if PSTATE.SM is 1 in the body of the current function,
+   but is not guaranteed to be 1 on entry.  */
+
+static bool
+aarch64_cfun_enables_pstate_sm ()
+{
+  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
+         && aarch64_cfun_incoming_sm_state () != AARCH64_FL_SM_ON);
+}
+
 /* Return true if the current function creates new ZA state (as opposed
    to sharing ZA with its callers or ignoring ZA altogether).  */
 
@@ -6432,6 +6455,10 @@ aarch64_add_offset_temporaries (rtx x)
    TEMP2, if nonnull, is a second temporary register that doesn't
    overlap either DEST or REG.
 
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.
+
    Since this function may be used to adjust the stack pointer, we must
    ensure that it cannot cause transient stack deallocation (for example
    by first incrementing SP and then decrementing when adjusting by a
@@ -6440,6 +6467,7 @@ aarch64_add_offset_temporaries (rtx x)
 static void
 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                    poly_int64 offset, rtx temp1, rtx temp2,
+                   aarch64_feature_flags force_isa_mode,
                    bool frame_related_p, bool emit_move_imm = true)
 {
   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
@@ -6452,9 +6480,17 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Try using ADDVL or ADDPL to add the whole value.  */
   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
     {
-      rtx offset_rtx = gen_int_mode (offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+       offset_rtx = gen_int_mode (offset, mode);
+      else
+       offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
+      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
+       add_reg_note (insn, REG_CFA_ADJUST_CFA,
+                     gen_rtx_SET (dest, plus_constant (Pmode, src,
+                                                       offset)));
       return;
     }
 
@@ -6470,11 +6506,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   if (src != const0_rtx
       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
     {
-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+       offset_rtx = gen_int_mode (poly_offset, mode);
+      else
+       offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
       if (frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
          RTX_FRAME_RELATED_P (insn) = true;
+         if (force_isa_mode & AARCH64_FL_SM_ON)
+           add_reg_note (insn, REG_CFA_ADJUST_CFA,
+                         gen_rtx_SET (dest, plus_constant (Pmode, src,
+                                                           poly_offset)));
          src = dest;
        }
       else
@@ -6505,8 +6549,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
       rtx val;
       if (IN_RANGE (rel_factor, -32, 31))
        {
+         if (force_isa_mode & AARCH64_FL_SM_ON)
+           {
+             /* Try to use an unshifted RDSVL, otherwise fall back on
+                a shifted RDSVL #1.  */
+             if (aarch64_sve_rdvl_addvl_factor_p (factor))
+               shift = 0;
+             else
+               factor = rel_factor * 16;
+             val = aarch64_sme_vq_immediate (mode, factor, 0);
+           }
          /* Try to use an unshifted CNT[BHWD].  */
-         if (aarch64_sve_cnt_factor_p (factor))
+         else if (aarch64_sve_cnt_factor_p (factor))
            {
              val = gen_int_mode (poly_int64 (factor, factor), mode);
              shift = 0;
@@ -6542,12 +6596,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
             a shift and add sequence for the multiplication.
             If CNTB << SHIFT is out of range, stick with the current
             shift factor.  */
-         if (IN_RANGE (low_bit, 2, 16 * 16))
+         if (force_isa_mode == 0
+             && IN_RANGE (low_bit, 2, 16 * 16))
            {
              val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
              shift = 0;
            }
-         else
+         else if ((force_isa_mode & AARCH64_FL_SM_ON)
+                  && aarch64_sve_rdvl_addvl_factor_p (low_bit))
+           {
+             val = aarch64_sme_vq_immediate (mode, low_bit, 0);
+             shift = 0;
+           }
+         else
            val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
          val = aarch64_force_temporary (mode, temp1, val);
@@ -6634,30 +6695,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
 {
   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
-                     temp1, temp2, false);
+                     temp1, temp2, 0, false);
 }
 
 /* Add DELTA to the stack pointer, marking the instructions frame-related.
-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
-   if TEMP1 already contains abs (DELTA).  */
+   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
+   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
+   contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
+               aarch64_feature_flags force_isa_mode, bool emit_move_imm)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
-                     temp1, temp2, true, emit_move_imm);
+                     temp1, temp2, force_isa_mode, true, emit_move_imm);
 }
 
 /* Subtract DELTA from the stack pointer, marking the instructions
-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
-   if nonnull.  */
+   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
+   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
 
 static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
-               bool emit_move_imm = true)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
+               aarch64_feature_flags force_isa_mode,
+               bool frame_related_p, bool emit_move_imm = true)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
-                     temp1, temp2, frame_related_p, emit_move_imm);
+                     temp1, temp2, force_isa_mode, frame_related_p,
+                     emit_move_imm);
 }
 
 /* A streaming-compatible function needs to switch temporarily to the known
@@ -7673,11 +7738,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
-                                     NULL_RTX, NULL_RTX, false);
+                                     NULL_RTX, NULL_RTX, 0, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
-                                   dest, NULL_RTX, false);
+                                   dest, NULL_RTX, 0, false);
            }
          return;
        }
@@ -7704,7 +7769,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
-                                 NULL_RTX, NULL_RTX, false);
+                                 NULL_RTX, NULL_RTX, 0, false);
              return;
            }
 
@@ -7744,7 +7809,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
              gcc_assert(can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
-                                 NULL_RTX, NULL_RTX, false);
+                                 NULL_RTX, NULL_RTX, 0, false);
              return;
            }
          /* FALLTHRU */
@@ -9212,6 +9277,9 @@ aarch64_need_old_pstate_sm ()
   if (aarch64_cfun_incoming_sm_state () != 0)
     return false;
 
+  if (aarch64_cfun_enables_pstate_sm ())
+    return true;
+
   if (cfun->machine->call_switches_sm_state)
     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
@@ -9238,6 +9306,7 @@ aarch64_layout_frame (void)
   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
   bool frame_related_fp_reg_p = false;
   aarch64_frame &frame = cfun->machine->frame;
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
 
   frame.emit_frame_chain = aarch64_needs_frame_chain ();
 
@@ -9277,7 +9346,7 @@ aarch64_layout_frame (void)
       frame.reg_offset[regno] = SLOT_REQUIRED;
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
        && !fixed_regs[regno]
        && !crtl->abi->clobbers_full_reg_p (regno))
       {
@@ -9306,7 +9375,7 @@ aarch64_layout_frame (void)
     }
 
   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
        && !fixed_regs[regno]
        && !crtl->abi->clobbers_full_reg_p (regno))
       frame.reg_offset[regno] = SLOT_REQUIRED;
@@ -10121,9 +10190,14 @@ aarch64_get_separate_components (void)
   bitmap_clear (components);
 
   /* The registers we need saved to the frame.  */
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
     if (aarch64_register_saved_on_entry (regno))
       {
+       if (enables_pstate_sm
+           && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
+         continue;
+
        /* Punt on saves and restores that use ST1D and LD1D.  We could
           try to be smarter, but it would involve making sure that the
           spare predicate register itself is safe to use at the save
@@ -10438,6 +10512,7 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
 static void
 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
                                        poly_int64 poly_size,
+                                       aarch64_feature_flags force_isa_mode,
                                        bool frame_related_p,
                                        bool final_adjustment_p)
 {
@@ -10498,7 +10573,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
   if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
+                     frame_related_p);
       return;
     }
 
@@ -10515,7 +10591,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
       /* First calculate the amount of bytes we're actually spilling.  */
       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
-                         poly_size, temp1, temp2, false, true);
+                         poly_size, temp1, temp2, force_isa_mode,
+                         false, true);
 
       rtx_insn *insn = get_last_insn ();
 
@@ -10573,7 +10650,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
        {
-         aarch64_sub_sp (NULL, temp2, guard_size, true);
+         aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
                                           guard_used_by_caller));
          emit_insn (gen_blockage ());
@@ -10584,7 +10661,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
-                         temp1, NULL, false, true);
+                         temp1, NULL, force_isa_mode, false, true);
       rtx_insn *insn = get_last_insn ();
 
       /* For the initial allocation, we don't have a frame pointer
@@ -10654,7 +10731,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       else if (final_adjustment_p && rounded_size == 0)
        residual_probe_offset = 0;
 
-      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
       if (residual >= min_probe_threshold)
        {
          if (dump_file)
@@ -10670,6 +10747,14 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     }
 }
 
+/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
+
+static bool
+aarch64_use_late_prologue_epilogue ()
+{
+  return aarch64_cfun_enables_pstate_sm ();
+}
+
 /* Return 1 if the register is used by the epilogue.  We need to say the
    return register is used, but only after epilogue generation is complete.
    Note that in the case of sibcalls, the values "used by the epilogue" are
@@ -10826,6 +10911,9 @@ aarch64_expand_prologue (void)
   unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
   rtx_insn *insn;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
     {
@@ -10887,7 +10975,7 @@ aarch64_expand_prologue (void)
      less the amount of the guard reserved for use by the caller's
      outgoing args.  */
   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
-                                         true, false);
+                                         force_isa_mode, true, false);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -10913,7 +11001,8 @@ aarch64_expand_prologue (void)
        gcc_assert (known_eq (chain_offset, 0));
       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                          stack_pointer_rtx, chain_offset,
-                         tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+                         tmp1_rtx, tmp0_rtx, force_isa_mode,
+                         frame_pointer_needed);
       if (frame_pointer_needed && !frame_size.is_constant ())
        {
          /* Variable-sized frames need to describe the save slot
@@ -10956,6 +11045,7 @@ aarch64_expand_prologue (void)
                  || known_eq (initial_adjust, 0));
       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
                                              sve_callee_adjust,
+                                             force_isa_mode,
                                              !frame_pointer_needed, false);
       saved_regs_offset += sve_callee_adjust;
     }
@@ -10968,10 +11058,13 @@ aarch64_expand_prologue (void)
   /* We may need to probe the final adjustment if it is larger than the guard
      that is assumed by the called.  */
   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+                                         force_isa_mode,
                                          !frame_pointer_needed, true);
 
-  /* Save the incoming value of PSTATE.SM, if required.  */
-  if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+  /* Save the incoming value of PSTATE.SM, if required.  Code further
+     down does this for locally-streaming functions.  */
+  if (known_ge (cfun->machine->frame.old_svcr_offset, 0)
+      && !aarch64_cfun_enables_pstate_sm ())
     {
       rtx mem = aarch64_old_svcr_mem ();
       MEM_VOLATILE_P (mem) = 1;
@@ -11022,7 +11115,40 @@ aarch64_expand_prologue (void)
       emit_insn (gen_aarch64_tpidr2_save ());
       emit_insn (gen_aarch64_clear_tpidr2 ());
       emit_label (label);
-      emit_insn (gen_aarch64_smstart_za ());
+      if (!aarch64_cfun_enables_pstate_sm ()
+         || known_ge (cfun->machine->frame.old_svcr_offset, 0))
+       emit_insn (gen_aarch64_smstart_za ());
+    }
+
+  /* Enable PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+       {
+         /* The current function is streaming-compatible.  Save the
+            original state of PSTATE.SM.  */
+         rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
+         emit_insn (gen_aarch64_read_svcr (svcr));
+         emit_move_insn (aarch64_old_svcr_mem (), svcr);
+         guard_label = aarch64_guard_switch_pstate_sm (svcr,
+                                                       aarch64_isa_flags);
+       }
+      aarch64_sme_mode_switch_regs args_switch;
+      auto &args = crtl->args.info;
+      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
+       {
+         rtx x = args.sme_mode_switch_args[i];
+         args_switch.add_reg (GET_MODE (x), REGNO (x));
+       }
+      args_switch.emit_prologue ();
+      if (cfun->machine->frame.has_new_za_state && !guard_label)
+       emit_insn (gen_aarch64_smstart ());
+      else
+       emit_insn (gen_aarch64_smstart_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+       emit_label (guard_label);
     }
 }
 
@@ -11073,6 +11199,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
   HOST_WIDE_INT guard_size
     = 1 << param_stack_clash_protection_guard_size;
   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   /* We can re-use the registers when:
 
@@ -11097,7 +11226,33 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
     = maybe_ne (get_frame_size ()
                + cfun->machine->frame.saved_varargs_size, 0);
 
-  if (cfun->machine->frame.has_new_za_state)
+  /* Reset PSTATE.SM, if required.  Fold an unconditional SMSTOP SM
+     and SMSTOP ZA into a single SMSTOP.  */
+  bool pending_smstop_za = cfun->machine->frame.has_new_za_state;
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+       guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+                                                     aarch64_isa_flags);
+      aarch64_sme_mode_switch_regs args_switch;
+      if (crtl->return_rtx && REG_P (crtl->return_rtx))
+       args_switch.add_reg (GET_MODE (crtl->return_rtx),
+                            REGNO (crtl->return_rtx));
+      args_switch.emit_prologue ();
+      if (pending_smstop_za && !guard_label)
+       {
+         emit_insn (gen_aarch64_smstop ());
+         pending_smstop_za = false;
+       }
+      else
+       emit_insn (gen_aarch64_smstop_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+       emit_label (guard_label);
+    }
+
+  if (pending_smstop_za)
     /* Turn ZA off before returning.  TPIDR2_EL0 is already null at
        this point.  */
     emit_insn (gen_aarch64_smstop_za ());
@@ -11122,12 +11277,13 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
     aarch64_add_offset (Pmode, stack_pointer_rtx,
                        hard_frame_pointer_rtx,
                        -callee_offset - below_hard_fp_saved_regs_size,
-                       tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+                       tmp1_rtx, tmp0_rtx, force_isa_mode,
+                       callee_adjust == 0);
   else
      /* The case where we need to re-use the register here is very rare, so
        avoid the complicated condition and just always emit a move if the
        immediate doesn't fit.  */
-     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
 
   /* Restore the vector registers before the predicate registers,
      so that we can use P4 as a temporary for big-endian SVE frames.  */
@@ -11136,7 +11292,8 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
   aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
                                false, &cfi_ops);
   if (maybe_ne (sve_callee_adjust, 0))
-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
+                   force_isa_mode, true);
 
   /* When shadow call stack is enabled, the scs_pop in the epilogue will
      restore x30, we don't need to restore x30 again in the traditional
@@ -11167,7 +11324,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
 
   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
      add restriction on emit_move optimization to leaf functions.  */
-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
                  (!can_inherit_p || !crtl->is_leaf
                   || df_regs_ever_live_p (EP0_REGNUM)));
 
@@ -11300,7 +11457,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   if (vcall_offset == 0)
-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
+                       0, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
@@ -11313,7 +11471,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
                                       plus_constant (Pmode, this_rtx, delta));
          else
            aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
-                               temp1, temp0, false);
+                               temp1, temp0, 0, false);
        }
 
       if (Pmode == ptr_mode)
@@ -29469,6 +29627,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_HAVE_SHADOW_CALL_STACK
 #define TARGET_HAVE_SHADOW_CALL_STACK true
 
+#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
+#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
+
 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
new file mode 100644
index 00000000000..ab9c8cd6bac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
@@ -0,0 +1,433 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+__attribute__((arm_streaming, arm_shared_za)) void consume_za ();
+
+/*
+** n_ls:
+**     stp     d8, d9, \[sp, #?-64\]!
+**     stp     d10, d11, \[sp, #?16\]
+**     stp     d12, d13, \[sp, #?32\]
+**     stp     d14, d15, \[sp, #?48\]
+**     smstart sm
+**     smstop  sm
+**     ldp     d10, d11, \[sp, #?16\]
+**     ldp     d12, d13, \[sp, #?32\]
+**     ldp     d14, d15, \[sp, #?48\]
+**     ldp     d8, d9, \[sp\], #?64
+**     ret
+*/
+void __attribute__((arm_locally_streaming))
+n_ls ()
+{
+  asm ("");
+}
+
+/*
+** s_ls:
+**     ret
+*/
+void __attribute__((arm_streaming, arm_locally_streaming))
+s_ls ()
+{
+  asm ("");
+}
+
+/*
+** sc_ls:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstart sm
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming))
+sc_ls ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_new_za:
+**     str     x30, \[sp, #?-80\]!
+**     stp     d8, d9, \[sp, #?16\]
+**     stp     d10, d11, \[sp, #?32\]
+**     stp     d12, d13, \[sp, #?48\]
+**     stp     d14, d15, \[sp, #?64\]
+**     mrs     x11, tpidr2_el0
+**     cbz     x11, .*
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     smstart
+**     bl      consume_za
+**     smstop
+**     ldp     d8, d9, \[sp, #?16\]
+**     ldp     d10, d11, \[sp, #?32\]
+**     ldp     d12, d13, \[sp, #?48\]
+**     ldp     d14, d15, \[sp, #?64\]
+**     ldr     x30, \[sp\], #?80
+**     ret
+*/
+void __attribute__((arm_locally_streaming, arm_new_za))
+n_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** s_ls_new_za:
+**     str     x30, \[sp, #?-16\]!
+**     mrs     x11, tpidr2_el0
+**     cbz     x11, .*
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     smstart za
+**     bl      consume_za
+**     smstop  za
+**     ldr     x30, \[sp\], #?16
+**     ret
+*/
+void __attribute__((arm_locally_streaming, arm_streaming, arm_new_za))
+s_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** sc_ls_new_za:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x11, tpidr2_el0
+**     cbz     x11, .*
+**     bl      __arm_tpidr2_save
+**     msr     tpidr2_el0, xzr
+**     smstart za
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstart sm
+**     bl      consume_za
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstop  sm
+**     smstop  za
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming, arm_new_za))
+sc_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** n_ls_shared_za:
+**     str     x30, \[sp, #?-80\]!
+**     stp     d8, d9, \[sp, #?16\]
+**     stp     d10, d11, \[sp, #?32\]
+**     stp     d12, d13, \[sp, #?48\]
+**     stp     d14, d15, \[sp, #?64\]
+**     smstart sm
+**     bl      consume_za
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?16\]
+**     ldp     d10, d11, \[sp, #?32\]
+**     ldp     d12, d13, \[sp, #?48\]
+**     ldp     d14, d15, \[sp, #?64\]
+**     ldr     x30, \[sp\], #?80
+**     ret
+*/
+void __attribute__((arm_locally_streaming, arm_shared_za))
+n_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** s_ls_shared_za:
+**     str     x30, \[sp, #?-16\]!
+**     bl      consume_za
+**     ldr     x30, \[sp\], #?16
+**     ret
+*/
+void __attribute__((arm_streaming, arm_locally_streaming, arm_shared_za))
+s_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** sc_ls_shared_za:
+**     stp     x29, x30, \[sp, #?-96\]!
+**     mov     x29, sp
+**     stp     d8, d9, \[sp, #?32\]
+**     stp     d10, d11, \[sp, #?48\]
+**     stp     d12, d13, \[sp, #?64\]
+**     stp     d14, d15, \[sp, #?80\]
+**     mrs     x16, svcr
+**     str     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstart sm
+**     bl      consume_za
+**     ldr     x16, \[x29, #?16\]
+**     tbnz    x16, 0, .*
+**     smstop  sm
+**     ldp     d8, d9, \[sp, #?32\]
+**     ldp     d10, d11, \[sp, #?48\]
+**     ldp     d12, d13, \[sp, #?64\]
+**     ldp     d14, d15, \[sp, #?80\]
+**     ldp     x29, x30, \[sp\], #?96
+**     ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming, arm_shared_za))
+sc_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** n_ls_vector_pcs:
+**     stp     q8, q9, \[sp, #?-256\]!
+**     stp     q10, q11, \[sp, #?32\]
+**     stp     q12, q13, \[sp, #?64\]
+**     stp     q14, q15, \[sp, #?96\]
+**     stp     q16, q17, \[sp, #?128\]
+**     stp     q18, q19, \[sp, #?160\]
+**     stp     q20, q21, \[sp, #?192\]
+**     stp     q22, q23, \[sp, #?224\]
+**     smstart sm
+**     smstop  sm
+**     ldp     q10, q11, \[sp, #?32\]
+**     ldp     q12, q13, \[sp, #?64\]
+**     ldp     q14, q15, \[sp, #?96\]
+**     ldp     q16, q17, \[sp, #?128\]
+**     ldp     q18, q19, \[sp, #?160\]
+**     ldp     q20, q21, \[sp, #?192\]
+**     ldp     q22, q23, \[sp, #?224\]
+**     ldp     q8, q9, \[sp\], #?256
+**     ret
+*/
+void __attribute__((arm_locally_streaming, aarch64_vector_pcs))
+n_ls_vector_pcs ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_sve_pcs:
+**     addsvl  sp, sp, #-18
+**     str     p4, \[sp\]
+**     str     p5, \[sp, #1, mul vl\]
+**     str     p6, \[sp, #2, mul vl\]
+**     str     p7, \[sp, #3, mul vl\]
+**     str     p8, \[sp, #4, mul vl\]
+**     str     p9, \[sp, #5, mul vl\]
+**     str     p10, \[sp, #6, mul vl\]
+**     str     p11, \[sp, #7, mul vl\]
+**     str     p12, \[sp, #8, mul vl\]
+**     str     p13, \[sp, #9, mul vl\]
+**     str     p14, \[sp, #10, mul vl\]
+**     str     p15, \[sp, #11, mul vl\]
+**     str     z8, \[sp, #2, mul vl\]
+**     str     z9, \[sp, #3, mul vl\]
+**     str     z10, \[sp, #4, mul vl\]
+**     str     z11, \[sp, #5, mul vl\]
+**     str     z12, \[sp, #6, mul vl\]
+**     str     z13, \[sp, #7, mul vl\]
+**     str     z14, \[sp, #8, mul vl\]
+**     str     z15, \[sp, #9, mul vl\]
+**     str     z16, \[sp, #10, mul vl\]
+**     str     z17, \[sp, #11, mul vl\]
+**     str     z18, \[sp, #12, mul vl\]
+**     str     z19, \[sp, #13, mul vl\]
+**     str     z20, \[sp, #14, mul vl\]
+**     str     z21, \[sp, #15, mul vl\]
+**     str     z22, \[sp, #16, mul vl\]
+**     str     z23, \[sp, #17, mul vl\]
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ldr     z8, \[sp, #2, mul vl\]
+**     ldr     z9, \[sp, #3, mul vl\]
+**     ldr     z10, \[sp, #4, mul vl\]
+**     ldr     z11, \[sp, #5, mul vl\]
+**     ldr     z12, \[sp, #6, mul vl\]
+**     ldr     z13, \[sp, #7, mul vl\]
+**     ldr     z14, \[sp, #8, mul vl\]
+**     ldr     z15, \[sp, #9, mul vl\]
+**     ldr     z16, \[sp, #10, mul vl\]
+**     ldr     z17, \[sp, #11, mul vl\]
+**     ldr     z18, \[sp, #12, mul vl\]
+**     ldr     z19, \[sp, #13, mul vl\]
+**     ldr     z20, \[sp, #14, mul vl\]
+**     ldr     z21, \[sp, #15, mul vl\]
+**     ldr     z22, \[sp, #16, mul vl\]
+**     ldr     z23, \[sp, #17, mul vl\]
+**     ldr     p4, \[sp\]
+**     ldr     p5, \[sp, #1, mul vl\]
+**     ldr     p6, \[sp, #2, mul vl\]
+**     ldr     p7, \[sp, #3, mul vl\]
+**     ldr     p8, \[sp, #4, mul vl\]
+**     ldr     p9, \[sp, #5, mul vl\]
+**     ldr     p10, \[sp, #6, mul vl\]
+**     ldr     p11, \[sp, #7, mul vl\]
+**     ldr     p12, \[sp, #8, mul vl\]
+**     ldr     p13, \[sp, #9, mul vl\]
+**     ldr     p14, \[sp, #10, mul vl\]
+**     ldr     p15, \[sp, #11, mul vl\]
+**     addsvl  sp, sp, #18
+**     ret
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_sve_pcs (__SVBool_t x)
+{
+  asm ("");
+}
+
+/*
+** n_ls_v0:
+**     addsvl  sp, sp, #-1
+**     ...
+**     smstart sm
+**     add     x[0-9]+, .*
+**     smstop  sm
+**     ...
+**     addsvl  sp, sp, #1
+**     ...
+*/
+#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
+void __attribute__((arm_locally_streaming))
+n_ls_v0 ()
+{
+  TEST (v0);
+}
+
+/*
+** n_ls_v32:
+**     addsvl  sp, sp, #-32
+**     ...
+**     smstart sm
+**     ...
+**     smstop  sm
+**     ...
+**     rdsvl   (x[0-9]+), #1
+**     lsl     (x[0-9]+), \1, #?5
+**     add     sp, sp, \2
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_v32 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+}
+
+/*
+** n_ls_v33:
+**     rdsvl   (x[0-9]+), #1
+**     mov     (x[0-9]+), #?33
+**     mul     (x[0-9]+), (?:\1, \2|\2, \1)
+**     sub     sp, sp, \3
+**     ...
+**     smstart sm
+**     ...
+**     smstop  sm
+**     ...
+**     rdsvl   (x[0-9]+), #1
+**     mov     (x[0-9]+), #?33
+**     mul     (x[0-9]+), (?:\4, \5|\5, \4)
+**     add     sp, sp, \6
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_v33 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+  TEST (v32);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
new file mode 100644
index 00000000000..4c9caf5d078
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
@@ -0,0 +1,177 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**     ...
+**     smstart sm
+**     .*
+**     fmov    x10, d0
+**     smstop  sm
+**     fmov    d0, x10
+**     ...
+*/
+double __attribute__((arm_locally_streaming))
+test_d0 ()
+{
+  asm ("");
+  return 1.0f;
+}
+
+/*
+** test_d0_vec:
+**     ...
+**     smstart sm
+**     .*
+** (
+**     fmov    x10, d0
+** |
+**     umov    x10, v0.d\[0\]
+** )
+**     smstop  sm
+**     fmov    d0, x10
+**     ...
+*/
+int8x8_t __attribute__((arm_locally_streaming))
+test_d0_vec ()
+{
+  asm volatile ("");
+  return (int8x8_t) {};
+}
+
+/*
+** test_q0:
+**     ...
+**     smstart sm
+**     .*
+**     str     q0, \[sp, #?-16\]!
+**     smstop  sm
+**     ldr     q0, \[sp\], #?16
+**     ...
+*/
+int8x16_t __attribute__((arm_locally_streaming))
+test_q0 ()
+{
+  asm volatile ("");
+  return (int8x16_t) {};
+}
+
+/*
+** test_q1:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-32\]!
+**     smstop  sm
+**     ldp     q0, q1, \[sp\], #?32
+**     ...
+*/
+int8x16x2_t __attribute__((arm_locally_streaming))
+test_q1 ()
+{
+  asm volatile ("");
+  return (int8x16x2_t) {};
+}
+
+/*
+** test_q2:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-48\]!
+**     str     q2, \[sp, #?32\]
+**     smstop  sm
+**     ldr     q2, \[sp, #?32\]
+**     ldp     q0, q1, \[sp\], #?48
+**     ...
+*/
+int8x16x3_t __attribute__((arm_locally_streaming))
+test_q2 ()
+{
+  asm volatile ("");
+  return (int8x16x3_t) {};
+}
+
+/*
+** test_q3:
+**     ...
+**     smstart sm
+**     ...
+**     stp     q0, q1, \[sp, #?-64\]!
+**     stp     q2, q3, \[sp, #?32\]
+**     smstop  sm
+**     ldp     q2, q3, \[sp, #?32\]
+**     ldp     q0, q1, \[sp\], #?64
+**     ...
+*/
+int8x16x4_t __attribute__((arm_locally_streaming))
+test_q3 ()
+{
+  asm volatile ("");
+  return (int8x16x4_t) {};
+}
+
+/*
+** test_z0:
+**     ...
+**     smstart sm
+**     mov     z0\.b, #0
+**     addvl   sp, sp, #-1
+**     str     z0, \[sp\]
+**     smstop  sm
+**     ldr     z0, \[sp\]
+**     addvl   sp, sp, #1
+**     ...
+*/
+svint8_t __attribute__((arm_locally_streaming))
+test_z0 ()
+{
+  asm volatile ("");
+  return (svint8_t) {};
+}
+
+/*
+** test_z3:
+**     ...
+**     smstart sm
+**     ...
+**     addvl   sp, sp, #-4
+**     str     z0, \[sp\]
+**     str     z1, \[sp, #1, mul vl\]
+**     str     z2, \[sp, #2, mul vl\]
+**     str     z3, \[sp, #3, mul vl\]
+**     smstop  sm
+**     ldr     z0, \[sp\]
+**     ldr     z1, \[sp, #1, mul vl\]
+**     ldr     z2, \[sp, #2, mul vl\]
+**     ldr     z3, \[sp, #3, mul vl\]
+**     ...
+*/
+svint8x4_t __attribute__((arm_locally_streaming))
+test_z3 ()
+{
+  asm volatile ("");
+  return (svint8x4_t) {};
+}
+
+/*
+** test_p0:
+**     ...
+**     smstart sm
+**     pfalse  p0\.b
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstop  sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     ...
+*/
+svbool_t __attribute__((arm_locally_streaming))
+test_p0 ()
+{
+  asm volatile ("");
+  return (svbool_t) {};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
new file mode 100644
index 00000000000..e6cbd9d176d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
@@ -0,0 +1,273 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**     ...
+**     fmov    x10, d0
+**     smstart sm
+**     fmov    d0, x10
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_d0 (double d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7:
+**     ...
+**     fmov    x10, d0
+**     fmov    x11, d1
+**     fmov    x12, d2
+**     fmov    x13, d3
+**     fmov    x14, d4
+**     fmov    x15, d5
+**     fmov    x16, d6
+**     fmov    x17, d7
+**     smstart sm
+**     fmov    d0, x10
+**     fmov    d1, x11
+**     fmov    d2, x12
+**     fmov    d3, x13
+**     fmov    d4, x14
+**     fmov    d5, x15
+**     fmov    d6, x16
+**     fmov    d7, x17
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_d7 (double d0, double d1, double d2, double d3,
+        double d4, double d5, double d6, double d7)
+{
+  asm volatile ("");
+}
+
+/*
+** test_d0_vec:
+**     ...
+** (
+**     fmov    x10, d0
+** |
+**     umov    x10, v0.d\[0\]
+** )
+**     smstart sm
+**     fmov    d0, x10
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_d0_vec (int8x8_t d0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_d7_vec:
+**     ...
+** (
+**     fmov    x10, d0
+**     fmov    x11, d1
+**     fmov    x12, d2
+**     fmov    x13, d3
+**     fmov    x14, d4
+**     fmov    x15, d5
+**     fmov    x16, d6
+**     fmov    x17, d7
+** |
+**     umov    x10, v0.d\[0\]
+**     umov    x11, v1.d\[0\]
+**     umov    x12, v2.d\[0\]
+**     umov    x13, v3.d\[0\]
+**     umov    x14, v4.d\[0\]
+**     umov    x15, v5.d\[0\]
+**     umov    x16, v6.d\[0\]
+**     umov    x17, v7.d\[0\]
+** )
+**     smstart sm
+**     fmov    d0, x10
+**     fmov    d1, x11
+**     fmov    d2, x12
+**     fmov    d3, x13
+**     fmov    d4, x14
+**     fmov    d5, x15
+**     fmov    d6, x16
+**     fmov    d7, x17
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
+            int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
+{
+  asm volatile ("");
+}
+
+/*
+** test_q0:
+**     ...
+**     str     q0, \[sp, #?-16\]!
+**     smstart sm
+**     ldr     q0, \[sp\], #?16
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_q0 (int8x16_t q0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_q7:
+**     ...
+**     stp     q0, q1, \[sp, #?-128\]!
+**     stp     q2, q3, \[sp, #?32\]
+**     stp     q4, q5, \[sp, #?64\]
+**     stp     q6, q7, \[sp, #?96\]
+**     smstart sm
+**     ldp     q2, q3, \[sp, #?32\]
+**     ldp     q4, q5, \[sp, #?64\]
+**     ldp     q6, q7, \[sp, #?96\]
+**     ldp     q0, q1, \[sp\], #?128
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_q7 (int8x16x4_t q0, int8x16x4_t q4)
+{
+  asm volatile ("");
+}
+
+/*
+** test_z0:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     z0, \[sp\]
+**     smstart sm
+**     ldr     z0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_z0 (svint8_t z0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_z7:
+**     ...
+**     addvl   sp, sp, #-8
+**     str     z0, \[sp\]
+**     str     z1, \[sp, #1, mul vl\]
+**     str     z2, \[sp, #2, mul vl\]
+**     str     z3, \[sp, #3, mul vl\]
+**     str     z4, \[sp, #4, mul vl\]
+**     str     z5, \[sp, #5, mul vl\]
+**     str     z6, \[sp, #6, mul vl\]
+**     str     z7, \[sp, #7, mul vl\]
+**     smstart sm
+**     ldr     z0, \[sp\]
+**     ldr     z1, \[sp, #1, mul vl\]
+**     ldr     z2, \[sp, #2, mul vl\]
+**     ldr     z3, \[sp, #3, mul vl\]
+**     ldr     z4, \[sp, #4, mul vl\]
+**     ldr     z5, \[sp, #5, mul vl\]
+**     ldr     z6, \[sp, #6, mul vl\]
+**     ldr     z7, \[sp, #7, mul vl\]
+**     addvl   sp, sp, #8
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_z7 (svint8x4_t z0, svint8x4_t z4)
+{
+  asm volatile ("");
+}
+
+/*
+** test_p0:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_p0 (svbool_t p0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_p3:
+**     ...
+**     addvl   sp, sp, #-1
+**     str     p0, \[sp\]
+**     str     p1, \[sp, #1, mul vl\]
+**     str     p2, \[sp, #2, mul vl\]
+**     str     p3, \[sp, #3, mul vl\]
+**     smstart sm
+**     ldr     p0, \[sp\]
+**     ldr     p1, \[sp, #1, mul vl\]
+**     ldr     p2, \[sp, #2, mul vl\]
+**     ldr     p3, \[sp, #3, mul vl\]
+**     addvl   sp, sp, #1
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm volatile ("");
+}
+
+/*
+** test_mixed:
+**     ...
+**     addvl   sp, sp, #-3
+**     str     p0, \[sp\]
+**     str     p1, \[sp, #1, mul vl\]
+**     str     p2, \[sp, #2, mul vl\]
+**     str     p3, \[sp, #3, mul vl\]
+**     str     z3, \[sp, #1, mul vl\]
+**     str     z7, \[sp, #2, mul vl\]
+**     stp     q2, q6, \[sp, #?-32\]!
+**     fmov    w10, s0
+**     fmov    x11, d1
+**     fmov    w12, s4
+**     fmov    x13, d5
+**     smstart sm
+**     fmov    s0, w10
+**     fmov    d1, x11
+**     fmov    s4, w12
+**     fmov    d5, x13
+**     ldp     q2, q6, \[sp\], #?32
+**     ldr     p0, \[sp\]
+**     ldr     p1, \[sp, #1, mul vl\]
+**     ldr     p2, \[sp, #2, mul vl\]
+**     ldr     p3, \[sp, #3, mul vl\]
+**     ldr     z3, \[sp, #1, mul vl\]
+**     ldr     z7, \[sp, #2, mul vl\]
+**     addvl   sp, sp, #3
+**     smstop  sm
+**     ...
+*/
+void __attribute__((arm_locally_streaming))
+test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
+           float s4, double d5, float64x2_t q6, svfloat64_t z7,
+           svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm volatile ("");
+}
-- 
2.25.1

