This is an automated email from the ASF dual-hosted git repository.

lukhut pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push: new b49468ddf1 [SME] Introduce scalable fp32 dense schedule (#16921) b49468ddf1 is described below commit b49468ddf11a1103d82f11009a0b3253a49705aa Author: Luke Hutton <luke.hut...@arm.com> AuthorDate: Wed May 15 11:28:16 2024 +0100 [SME] Introduce scalable fp32 dense schedule (#16921) This commit adds a new scalable fp32 dense schedule that calls SME intrinsics according to the SME RFC: https://github.com/apache/tvm-rfcs/pull/107. Currently the schedule does not make use of predication, meaning the output from the matmul compute must be copied in a subsequent compute stage. This will be removed once support for predication is added. --- python/tvm/micro/testing/aot_test_utils.py | 10 + python/tvm/relay/op/strategy/arm_cpu.py | 69 +++- python/tvm/testing/utils.py | 17 + python/tvm/tir/tensor_intrin/__init__.py | 1 - python/tvm/tir/tensor_intrin/arm_cpu.py | 362 ++++++++++++++++++++- python/tvm/topi/arm_cpu/__init__.py | 5 +- python/tvm/topi/arm_cpu/arm_utils.py | 26 ++ python/tvm/topi/arm_cpu/dense.py | 10 +- python/tvm/topi/arm_cpu/dense_alter_op.py | 75 +++++ python/tvm/topi/arm_cpu/matmul.py | 124 +++++++ python/tvm/topi/x86/dense_alter_op.py | 2 +- src/arith/const_int_bound.cc | 2 +- src/relay/backend/te_compiler_cache.cc | 4 +- src/relay/op/nn/nn.cc | 1 + src/tir/schedule/ir_comparator.cc | 6 +- .../python/codegen/test_target_codegen_aarch64.py | 46 ++- tests/python/integration/test_arm_aprofile.py | 94 ------ ...est_meta_schedule_postproc_rewrite_tensorize.py | 2 +- .../relay/strategy/arm_cpu/scalable_utils.py | 53 +++ .../arm_cpu/{test_dense_dsp.py => test_dense.py} | 91 +++++- tests/python/relay/strategy/arm_cpu/test_matmul.py | 118 +++++++ .../relay/strategy/test_select_implementation.py | 55 +++- tests/python/relay/test_pass_alter_op_layout.py | 56 ++++ tests/python/topi/test_topi_matmul.py | 20 +- 24 files changed, 1127 insertions(+), 122 deletions(-) diff --git a/python/tvm/micro/testing/aot_test_utils.py b/python/tvm/micro/testing/aot_test_utils.py index 06cd0f1c9e..991a3f0ddb 100644 --- a/python/tvm/micro/testing/aot_test_utils.py +++ b/python/tvm/micro/testing/aot_test_utils.py @@ -65,6 +65,16 @@ AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner( }, ) +AOT_APROFILE_AEM_RUNNER = AOTTestRunner( + makefile="aprofile_aem", + includes=[], + pass_config={ + "tir.usmp.enable": False, + # AOT test infra generates 'fake' tensor inputs which fails asserts + "tir.disable_assert": True, + }, +) + def parametrize_aot_options(test): """Parametrize over valid option combinations""" diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 2fc148c3ef..9974d2691d 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -21,7 +21,9 @@ import logging # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import import re +import tvm from tvm import relay, topi, tir +from tvm.tir.schedule.analysis import has_block from ....auto_scheduler import is_auto_scheduler_enabled from ....meta_schedule import is_meta_schedule_enabled @@ -639,7 +641,7 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): def schedule_dense_arm_cpu(attrs, inputs, out_type, target): """dense arm cpu strategy""" strategy = _op.OpStrategy() - data, _ = inputs + data, weight = inputs if target.features.has_dsp and data.dtype in ["int8", "int16"]: strategy.add_implementation( @@ -680,6 +682,23 @@ def schedule_dense_arm_cpu(attrs, inputs, 
out_type, target): plevel=11, ) + if ( + target.features.has_sme + and data.dtype in ["float32"] + and weight.dtype in ["float32"] + and out_type.dtype in ["float32"] + # The schedule uses tensorization which does not work when the + # reduction axis has unit iters. See + # https://github.com/apache/tvm/issues/16566 + and data.shape[1] > 1 + ): + strategy.add_implementation( + wrap_compute_dense(topi.arm_cpu.compute_matmul_sme), + lambda: None, + name="matmul.arm_cpu.sme", + plevel=12, + ) + # Fallback to x86 schedules as there is currently no arm_cpu schedule for dense strategy.add_implementation( wrap_compute_dense(topi.x86.dense_nopack), @@ -697,6 +716,40 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target): return strategy +@matmul_strategy.register("arm_cpu") +def matmul_strategy_arm_cpu(attrs, inputs, out_type, target): + """matmul arm cpu strategy""" + strategy = _op.OpStrategy() + data, weight = inputs + + if ( + target.features.has_sme + and data.dtype in ["float32"] + and weight.dtype in ["float32"] + and out_type.dtype in ["float32"] + and not (attrs.transpose_a or attrs.transpose_b) + and len(data.shape) == 2 + # The schedule uses tensorization which does not work when the + # reduction axis has unit iters. See + # https://github.com/apache/tvm/issues/16566 + and data.shape[1] > 1 + ): + # Ideally we should check that weight is a Relay constant, but strategy functions + # don't have access to the data needed to check this. + strategy.add_implementation( + wrap_compute_matmul(topi.arm_cpu.compute_matmul_sme), + lambda: None, + name="matmul.arm_cpu.sme", + ) + return strategy + + logger.warning("matmul is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic" + ) + return strategy + + @conv1d_strategy.register("arm_cpu") def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target): """conv1d strategy""" @@ -737,3 +790,17 @@ def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target): f"Unsupported kernel layout {kernel_layout} for conv1d {layout} for arm cpu." ) return strategy + + +def arm_cpu_tir_strategy(sch: tir.Schedule) -> bool: + """ + Strategy for arm_cpu STIR schedules. + """ + current_target = tvm.target.Target.current() + + if current_target.features.has_sme and has_block(sch, "matmul_sme_gemm"): + topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch) + return True + + # Fallback to TE schedule for operators we have not written a special TIR schedule for + return False diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index ac22af2823..38b39b5fc2 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1023,6 +1023,19 @@ requires_corstone300 = Feature( parent_features="cmsisnn", ) + +def _aprofile_aem_fvp_compile_time_check(): + if shutil.which("FVP_Base_RevC-2xAEMvA") is None: + return "AProfile AEM is not available" + return True + + +requires_aprofile_aem_fvp = Feature( + "aprofile-aem-fvp", + "AProfile AEM FVP", + compile_time_check=_aprofile_aem_fvp_compile_time_check, +) + # Mark a test as requiring Vitis AI to run requires_vitis_ai = Feature("vitis_ai", "Vitis AI", cmake_flag="USE_VITIS_AI") @@ -1205,6 +1218,10 @@ def skip_if_32bit(reason): return decorator +def skip_if_no_reference_system(func): + return skip_if_32bit(reason="Reference system unavailable in i386 container")(func) + + def requires_package(*packages): """Mark a test as requiring python packages to run. 
diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py index 7e5a26bdeb..d127335e82 100644 --- a/python/tvm/tir/tensor_intrin/__init__.py +++ b/python/tvm/tir/tensor_intrin/__init__.py @@ -16,4 +16,3 @@ # under the License. # pylint: disable=unused-import """Intrinsics for tensorization.""" -from . import arm_cpu, cuda, rocm, x86, hexagon diff --git a/python/tvm/tir/tensor_intrin/arm_cpu.py b/python/tvm/tir/tensor_intrin/arm_cpu.py index a5003d41a8..90af1e05b1 100644 --- a/python/tvm/tir/tensor_intrin/arm_cpu.py +++ b/python/tvm/tir/tensor_intrin/arm_cpu.py @@ -17,6 +17,10 @@ # pylint: disable=invalid-name,missing-function-docstring,unused-import """Intrinsics for ARM tensorization.""" from tvm.script import tir as T +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder.tir import prim_func as build_prim_func +from tvm.target.codegen import llvm_version_major + from .. import TensorIntrin from .dot_product_common import ( DP4A_S8S8S32_INTRIN, @@ -163,15 +167,367 @@ def get_dotprod_intrin(in_dtype, out_dtype): return dot_prod_desc, dot_prod_impl +def get_sme_transpose_interleave_2svlx2svl_intrin(): + """ + Transpose a matrix of size 2SVL x 2SVL (where 'SVL' is the Scalable Vector Length) using + the Scalable Matrix Extension (SME). + + This is completed by loading rows of the input matrix into the accumulator tile, + then storing the columns. The SME accumulator tile is divided into a series of sub-tiles + which must be loaded to / stored from independently. + + Note: currently only supports the fp32 datatype. + + Example + ------- + An example case for float32. In this instance the accumulator tile is divided into 4 + sub-tiles of size SVLxSVL numbered 0-3. We start by loading rows of A, each SVL in length, + into each of the sub-tiles. In the diagram below, each load for a sub-tile is sequenced by + a, b, ... till the tile is full. + + The columns of each sub-tile are then stored into A_t. Note that to perform a transpose, + the contents of sub-tile 1 and 2 are stored in opposite locations - see the diagram + below. + + A: Accumulator tile: A_t: + 2SVL 2SVL 2SVL + +----------------+ +-----------------+ +-------------------+ + | --0a-- --1a-- | | | | | | | | | + | --0b-- --1b-- | | 0 1 | | 0a 0b .. 2a 2b .. | + | ... ... | ld1w.horiz | | st1w.vert | | | | | | + 2SVL | --2a-- --3a-- | ====> 2SVL | | ====> 2SVL | | | | | | + | --2a-- --3b-- | | 2 3 | | 1a 1b .. 3a 3b .. | + | ... ... | | | | | | | | | + +----------------+ +-----------------+ +-------------------+ + + Returns + ------- + intrin : TensorIntrin + The SME TensorIntrin that can be used in tensorizing a schedule. + + """ + SVF = 4 * T.vscale() + SVF2 = 2 * SVF + + @T.prim_func + def desc(a: T.handle, a_t: T.handle) -> None: + A = T.match_buffer(a, (SVF2, SVF2), dtype="float32", offset_factor=1) + A_t = T.match_buffer(a_t, (SVF2, SVF2), dtype="float32", offset_factor=1) + with T.block("root"): + T.reads(A[0:SVF2, 0:SVF2]) + T.writes(A_t[0:SVF2, 0:SVF2]) + for k, m in T.grid(SVF2, SVF2): + with T.block("transpose"): + v_m, v_k = T.axis.remap("SS", [m, k]) + A_t[v_k, v_m] = A[v_m, v_k] + + def impl(): + # Accumulation sub-tile count. 
For fp32 it is 4 + sub_tile_count = 4 + + with IRBuilder() as ib: + with build_prim_func(): + a = T.arg("a", T.handle()) + a_t = T.arg("a_t", T.handle()) + + A = T.match_buffer( + a, (SVF2, SVF2), "float32", offset_factor=1, strides=[T.int32(), 1] + ) + A_t = T.match_buffer( + a_t, + (SVF2, SVF2), + "float32", + offset_factor=1, + strides=[T.int32(), 1], + ) + + # Disable predication + ptrue = T.broadcast(T.IntImm("int1", 1), T.vscale() * 4) + + with T.block("root"): + T.reads(A[0:SVF2, 0:SVF2]) + T.writes(A_t[0:SVF2, 0:SVF2]) + + # Load rows of the input matrix + with T.serial(0, SVF) as slice_idx: + for sub_tile_idx in range(0, sub_tile_count): + row_offset = SVF if sub_tile_idx >= (sub_tile_count // 2) else 0 + col_offset = SVF if sub_tile_idx % 2 else 0 + offset = (slice_idx + row_offset) * A.strides[0] + col_offset + + input_ptr = A.access_ptr("r", offset=offset) + sub_tile = T.int32(sub_tile_idx) + T.evaluate( + T.call_llvm_intrin( + "void", + "llvm.aarch64.sme.ld1w.horiz", + T.uint32(4), + ptrue, + input_ptr, + sub_tile, + slice_idx, + ) + ) + + # Store columns to the output matrix + with T.serial(0, SVF) as slice_idx: + for sub_tile_idx in range(0, sub_tile_count): + col_offset = SVF if sub_tile_idx >= (sub_tile_count // 2) else 0 + row_offset = SVF if sub_tile_idx % 2 else 0 + offset = (slice_idx + row_offset) * A_t.strides[0] + col_offset + + output_ptr = A_t.access_ptr("w", offset=offset) + sub_tile = T.int32(sub_tile_idx) + T.evaluate( + T.call_llvm_intrin( + "void", + "llvm.aarch64.sme.st1w.vert", + T.uint32(4), + ptrue, + output_ptr, + sub_tile, + slice_idx, + ) + ) + + return ib.get() + + return desc, impl() + + +def get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(K): + """ + Compute a GEMM of size 2SVL x 2SVL (where 'SVL' is the Scalable Vector Length) using + outer product operations from the Scalable Matrix Extension (SME). + + The inputs A and B are expected to be of size K x 2SVL and produce a result C of + size 2SVL x 2SVL. + + The SME accumulator tile is divided into sub-tiles, each of which is utilized to + calculate the outer-product using columns / rows of A and B respectively. For each + sub-tile, elements in the first column of input matrix A (accessed sequentially due + to being transpose-interleaved) and first row of input matrix B are used to calculate + an outer-product. This is then accumulated with the result of performing an + outer-product on the second column and row of A and B respectively. This process is + repeated K times. Finally, the results of the accumulation are stored. + + Note: The input tensor 'A' must be transpose-interleaved. + Note: Currently only supports the fp32 datatype. + + Example + ------- + + Diagram showing outer-product performed on each of the accumulator sub-tiles + for the fp32 datatype: + + SVL SVL + +----------------------------+ + | l | h | K + K +----------------------------+ + +---+ +----------------------------+ + | | | 0: 1: |-+ + | | | mopa(l, l) mopa(l, h) | |-+ + l | | | | | | + | | | | | | + |---| | | | | + | | | 2: 3: | | | + h | | | mopa(h, l) mopa(h, h) | | | + | | | | | | + | | | | | | + +---+ +----------------------------+ | | + +----------------------------+ | + +---------------------------+ + (accumulate K times) + + Pseudo code computing 2SVL x 2SVL GEMM for fp32 inputs: + + ..
code-block:: c + + // Number of fp32 elements in a scalable vector + int SVF = SVL / 32; + + // Reset the accumulator tile + sme.zero(); + + // Calculate outer products and accumulate + for (k = 0; k < K; k++) { + float32xSVF A_row_0 = A[k][0]; + float32xSVF A_row_1 = A[k][SVF]; + float32xSVF B_row_0 = B[k][0]; + float32xSVF B_row_1 = B[k][SVF]; + + float32xSVFxSVF sub_tile_0 += sme.mopa(A_row_0, B_row_0); + float32xSVFxSVF sub_tile_1 += sme.mopa(A_row_0, B_row_1); + float32xSVFxSVF sub_tile_2 += sme.mopa(A_row_1, B_row_0); + float32xSVFxSVF sub_tile_3 += sme.mopa(A_row_1, B_row_1); + } + + // Store the results of accumulation + for (i = 0; i < SVF; i++) { + C[i][0] = sme.horiz(sub_tile_0[i]); + C[i][SVF] = sme.horiz(sub_tile_1[i]); + C[i + SVF][0] = sme.horiz(sub_tile_2[i]); + C[i + SVF][SVF] = sme.horiz(sub_tile_3[i]); + } + + Notes: + - Recall that A has been transposed beforehand such that each column is now accessed + by row. + - 'sme.zero' resets the accumulator tile to contain all zeros. + - 'sme.mopa' is the outer product and accumulate intrinsic. + - 'sme.horiz' stores rows of an accumulator sub-tile to memory. + + Returns + ------- + intrin : TensorIntrin + The SME TensorIntrin that can be used in tensorizing a schedule. + + """ + SVF = 4 * T.vscale() + SVF2 = 2 * SVF + + @T.prim_func + def desc(a: T.handle, b: T.handle, c: T.handle): + A = T.match_buffer(a, (K, SVF2), dtype="float32", offset_factor=1) + B = T.match_buffer(b, (K, SVF2), dtype="float32", offset_factor=1) + C = T.match_buffer(c, (SVF2, SVF2), dtype="float32", offset_factor=1) + + with T.block("root"): + T.reads(C[0:SVF2, 0:SVF2], A[0:K, 0:SVF2], B[0:K, 0:SVF2]) + T.writes(C[0:SVF2, 0:SVF2]) + for m, n, k in T.grid(SVF2, SVF2, K): + with T.block("gemm"): + v_m, v_n, v_k = T.axis.remap("SSR", [m, n, k]) + C[v_m, v_n] += A[v_k, v_m] * B[v_k, v_n] + + def impl(): + # Accumulation sub-tile count.
For fp32 it is 4 + sub_tile_count = 4 + + with IRBuilder() as ib: + with build_prim_func(): + a = T.arg("a", T.handle()) + b = T.arg("b", T.handle()) + c = T.arg("c", T.handle()) + + A = T.match_buffer(a, (K, SVF2), "float32", offset_factor=1, strides=[T.int32(), 1]) + B = T.match_buffer(b, (K, SVF2), "float32", offset_factor=1, strides=[T.int32(), 1]) + C = T.match_buffer( + c, (SVF2, SVF2), "float32", offset_factor=1, strides=[T.int32(), 1] + ) + + ptrue = T.broadcast(T.IntImm("int1", 1), T.vscale() * 4) + + with T.block("root"): + T.reads(C[0:SVF2, 0:SVF2], A[0:K, 0:SVF2], B[0:K, 0:SVF2]) + T.writes(C[0:SVF2, 0:SVF2]) + + # Iterate over the reduction axis applying outer product and accumulate + with T.serial(K) as k: + a_low = T.BufferLoad(A, [k, T.Ramp(0, 1, T.vscale() * 4)]) + a_high = T.BufferLoad(A, [k, T.Ramp(SVF, 1, T.vscale() * 4)]) + b_low = T.BufferLoad(B, [k, T.Ramp(0, 1, T.vscale() * 4)]) + b_high = T.BufferLoad(B, [k, T.Ramp(SVF, 1, T.vscale() * 4)]) + + input_combinations = [ + (a_low, b_low), + (a_low, b_high), + (a_high, b_low), + (a_high, b_high), + ] + for sub_tile_idx in range(0, sub_tile_count): + sub_tile = T.int32(sub_tile_idx) + input_1 = input_combinations[sub_tile_idx][0] + input_2 = input_combinations[sub_tile_idx][1] + + T.evaluate( + T.call_llvm_intrin( + "void", + "llvm.aarch64.sme.mopa.nxv4f32", + T.uint32(5), + sub_tile, + ptrue, + ptrue, + input_1, + input_2, + ) + ) + + # Store the accumulated tile results + with T.serial(SVF) as slice_idx: + for sub_tile_idx in range(sub_tile_count): + vert_offset = SVF if sub_tile_idx >= (sub_tile_count // 2) else 0 + horiz_offset = SVF if sub_tile_idx % 2 else 0 + local_offset = (slice_idx + vert_offset) * C.strides[0] + horiz_offset + output_ptr = C.access_ptr("w", offset=local_offset, extent=SVF) + + T.evaluate( + T.call_llvm_intrin( + "void", + "llvm.aarch64.sme.st1w.horiz", + T.uint32(4), + ptrue, + output_ptr, + T.int32(sub_tile_idx), + T.int32(slice_idx), + ) + ) + + return ib.get() + + return desc, impl() + + +def get_sme_init_intrin(): + """ + Reset the entire matrix tile storage to 0. 
+ """ + SVF2 = 2 * 4 * T.vscale() + + @T.prim_func + def desc(c: T.handle) -> None: + C = T.match_buffer(c, (SVF2, SVF2), "float32", offset_factor=1) + with T.block("root"): + T.reads() + T.writes(C[0:SVF2, 0:SVF2]) + for m, n in T.grid(SVF2, SVF2): + with T.block("init"): + v_m, v_n = T.axis.remap("SS", [m, n]) + C[v_m, v_n] = T.float32(0) + + @T.prim_func + def impl(c: T.handle) -> None: + C = T.match_buffer(c, (SVF2, SVF2), "float32", offset_factor=1) + with T.block("root"): + T.reads() + T.writes(C[0:SVF2, 0:SVF2]) + clear_all_tiles = T.int32(255) + T.evaluate( + T.call_llvm_intrin("void", "llvm.aarch64.sme.zero", T.uint32(1), clear_all_tiles) + ) + + return desc, impl + + ARM_DOT_4x4_i8_NEON_INTRIN = "dot_4x4_i8i8s32_neon" ARM_DOT_4x4_i8_SDOT_INTRIN = "dot_4x4_i8i8s32_sdot" ARM_DOT_4x4_u8_UDOT_INTRIN = "dot_4x4_u8u8u32_udot" ARM_DOT_4x4_u8_HDOT_INTRIN = "dot_4x4_u8u8i32_hdot" TensorIntrin.register(ARM_DOT_4x4_i8_NEON_INTRIN, neon_4x4_i8i8i32_desc, neon_4x4_i8i8i32_impl) - TensorIntrin.register(ARM_DOT_4x4_i8_SDOT_INTRIN, *get_dotprod_intrin("int8", "int32")) - TensorIntrin.register(ARM_DOT_4x4_u8_UDOT_INTRIN, *get_dotprod_intrin("uint8", "uint32")) - TensorIntrin.register(ARM_DOT_4x4_u8_HDOT_INTRIN, *get_dotprod_intrin("uint8", "int32")) + +ARM_SME_INIT = "sme_init" +ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE = "sme_2svlx2svl_transpose_interleave" +ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA = "sme_2svlx2svl_gemm_interleaved_mopa" + +# The following tensor intrinsics use LLVM intrinsics that are only available +# in versions of LLVM >= 15. Installations with older versions of LLVM will +# not be able to use them. +if llvm_version_major() >= 15: + TensorIntrin.register( + ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE, *get_sme_transpose_interleave_2svlx2svl_intrin() + ) + TensorIntrin.register(ARM_SME_INIT, *get_sme_init_intrin()) diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index 054103f43b..5484adaa64 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -22,13 +22,16 @@ from .conv2d import * from .depthwise_conv2d import * from .conv2d_transpose import * from .conv2d_int8 import * -from . import conv2d_alter_op from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * from .group_conv2d import * from .pooling import * from .dense import * +from .matmul import * from .qnn import * + +from . import conv2d_alter_op +from . import dense_alter_op from . import qnn_alter_op from . import qnn_legalize diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index c350b87167..f2e01c5aef 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -19,6 +19,7 @@ import tvm from tvm.target import Target +from tvm.tir.expr import PrimExpr def get_tiling_A(interleave_A, in_dtype): @@ -186,6 +187,31 @@ def get_conv2d_im2col_padding(M, K, tile_M, tile_K): return pad_M, pad_K +def pad_dim_to_multiple(dim: PrimExpr, multiple: PrimExpr): + """ + Compute the padding required to reach specified multiple. + + Parameters + ---------- + dim : PrimExpr + Current size of the dim. + multiple : PrimExpr + Multiple to pad up to. + + Returns + ------- + padded_dim : PrimExpr + The new dim size. + pad_value : PrimExpr + The padding required. 
+ """ + pad_value = 0 + if dim % multiple != 0: + pad_value = multiple - (dim % multiple) + padded_dim = dim + pad_value + return padded_dim, pad_value + + def get_conv2d_weights_padding(N, K, tile_N, tile_K): """Compute the necessary padding for matrix B', where B' is the transformed version of matrix B in C=A*B. diff --git a/python/tvm/topi/arm_cpu/dense.py b/python/tvm/topi/arm_cpu/dense.py index dd66b0d531..6a44cc89b0 100644 --- a/python/tvm/topi/arm_cpu/dense.py +++ b/python/tvm/topi/arm_cpu/dense.py @@ -14,16 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel """Dense schedule for ARM CPU""" - from tvm import autotvm -from .mprofile.dsp.dense import dense_dsp_schedule, dense_dsp_compute + +from .mprofile.dsp.dense import ( + dense_dsp_schedule, + dense_dsp_compute, +) @autotvm.register_topi_compute("dense_dsp.arm_cpu") def dense_dsp(cfg, data, weight, bias, out_dtype): - """Compute conv2d_nhwc with v7e-m DSP instructions.""" + """Compute dense_dsp with v7e-m DSP instructions.""" return dense_dsp_compute(cfg, data, weight, bias=bias, out_dtype=out_dtype) diff --git a/python/tvm/topi/arm_cpu/dense_alter_op.py b/python/tvm/topi/arm_cpu/dense_alter_op.py new file mode 100644 index 0000000000..208b923e68 --- /dev/null +++ b/python/tvm/topi/arm_cpu/dense_alter_op.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Dense alter op definitions for the `arm_cpu` device key.""" + +import tvm +from tvm import relay +from tvm import autotvm +from tvm import te + +from ..nn import dense_alter_layout + + +@dense_alter_layout.register("arm_cpu") +def _alter_dense(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.te_compiler.select_implementation( + relay.op.get("nn.dense"), + attrs, + tinfos, + out_type, + target, + ) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. 
+ return None + + cfg = dispatch_ctx.query(target, workload) + topi_impl = workload[0] + if topi_impl == "matmul.arm_cpu.sme": + # Pre-compute transposed weights and convert to a matmul + assert isinstance( + inputs[1], relay.Constant + ), "matmul_sme.arm_cpu requires weights be a Relay Constant" + + weight_dtype = tinfos[1].dtype + weight_data = inputs[1].data.numpy() + interleaved = weight_data.transpose() + encoded_weight = relay.const(interleaved, weight_dtype) + + new_weight = te.placeholder((weight_data.shape), dtype=weight_dtype) + new_workload = autotvm.task.args_to_workload( + [tinfos[0], new_weight, None, out_type.dtype], topi_impl + ) + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.matmul( + inputs[0], + encoded_weight, + units=attrs.units, + out_dtype=attrs.out_dtype, + transpose_a=False, + transpose_b=False, + ) + + # x86 schedules are used as a fallback + return tvm.topi.x86.dense_alter_op._alter_dense_layout(attrs, inputs, tinfos, out_type) diff --git a/python/tvm/topi/arm_cpu/matmul.py b/python/tvm/topi/arm_cpu/matmul.py new file mode 100644 index 0000000000..ea8b27cabc --- /dev/null +++ b/python/tvm/topi/arm_cpu/matmul.py @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-argument + +"""Matmul schedules for the `arm_cpu` device key.""" + +import tvm +from tvm import te +from tvm import autotvm +from tvm.script import tir as T +from tvm.topi import nn +from tvm.topi.utils import get_const_tuple +from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes +from tvm.topi.arm_cpu.arm_utils import pad_dim_to_multiple + + +@autotvm.register_topi_compute("matmul.arm_cpu.sme") +def compute_matmul_sme(cfg, data_a, data_b, _, out_dtype, transpose_a=False, transpose_b=False): + """ + SME Matmul compute definition. + """ + assert ( + transpose_a == transpose_b == False + ), "Compute definition currently does not support transposed inputs." + + M, K = get_const_tuple(data_a.shape) + N = get_const_tuple(data_b.shape)[1] + + if not out_dtype: + out_dtype = data_a.dtype + + tile_m = 2 * 4 * tvm.tir.vscale() + tile_n = 2 * 4 * tvm.tir.vscale() + + M_padded, pad_M = pad_dim_to_multiple(M, tile_m) + N_padded, pad_N = pad_dim_to_multiple(N, tile_n) + if pad_M != 0: + data_a = nn.pad(data_a, pad_before=(0, 0), pad_after=(pad_M, 0)) + if pad_N != 0: + data_b = nn.pad(data_b, pad_before=(0, 0), pad_after=(0, pad_N)) + + k = te.reduce_axis((0, K), name="k") + C = te.compute( + (M_padded, N_padded), + lambda m, n: te.sum( + data_a[m, k].astype(data_a.dtype) * data_b[k, n].astype(data_b.dtype), + axis=k, + ).astype(out_dtype), + name="matmul_sme_gemm", + ) + C = te.compute((M, N), lambda m, n: C[m, n]) + return C + + +def tir_schedule_matmul_sme(sch): + """ + SME STIR Matmul schedule. 
+ """ + # pylint: disable=import-outside-toplevel + from tvm.tir.tensor_intrin.arm_cpu import ( + ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE, + ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA, + ARM_SME_INIT, + get_sme_gemm_interleaved_mopa_2svlx2svl_intrin, + ) + + gemm_block = sch.get_block("matmul_sme_gemm") + m, n, k = sch.get_loops(gemm_block) + + extent_m = sch.get(m).extent + extent_k = sch.get(k).extent + + tile_m = T.cast(2 * 4 * T.vscale(), extent_m.dtype) + tile_k = T.cast(2 * 4 * T.vscale(), extent_k.dtype) + tile_n = T.cast(2 * 4 * T.vscale(), sch.get(n).extent.dtype) + + # Interleave the input utilizing the matrix tile + interleave_a_block = sch.cache_read(gemm_block, 0, "global") + sch.transform_layout(interleave_a_block, ("write", 0), lambda m, k: (k, m)) + m, k = sch.get_loops(interleave_a_block) + outer_m, inner_m = sch.split(m, factors=(None, tile_m), disable_predication=True) + outer_k, inner_k = sch.split(k, factors=(None, tile_k), disable_predication=True) + sch.reorder(outer_k, outer_m, inner_k, inner_m) + sch.tensorize(inner_k, ARM_SME_2SVLx2SVL_TRANSPOSE_INTERLEAVE) + + # Split and reorder the loops of the GeMM for tensorization + m, n, k = sch.get_loops(gemm_block) + outer_m, inner_m = sch.split(m, factors=(None, tile_m), disable_predication=True) + outer_n, inner_n = sch.split(n, factors=(None, tile_n), disable_predication=True) + sch.reorder(outer_m, outer_n, inner_m, inner_n, k) + + # Tensorize the GeMM initialization + init_block = sch.decompose_reduction(gemm_block, inner_m) + sch.tensorize(sch.get_loops(init_block)[-2], ARM_SME_INIT) + + # Tensorize the GeMM update + sme_gemm_interleaved_intrin_name = ARM_SME_2SVLx2SVL_GEMM_INTERLEAVED_MOPA + f"_{extent_k}" + tvm.tir.TensorIntrin.register( + sme_gemm_interleaved_intrin_name, + *get_sme_gemm_interleaved_mopa_2svlx2svl_intrin(extent_k), + override=True, + ) + sch.tensorize(inner_m, sme_gemm_interleaved_intrin_name) + + # Add pstate annotations + root_block = sch.get_block("root") + sch.annotate( + root_block, SMEAttributes.STREAMING_MODE, SMEAttributes.StreamingModeValues.ENABLED + ) + sch.annotate(root_block, SMEAttributes.ZA_STORAGE, SMEAttributes.ZAStorageValues.NEW) diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index 0e9b1f7b65..10b1248c6a 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -39,7 +39,7 @@ def check_int8_applicable(x, y, allow_padding=False): ) -@dense_alter_layout.register(["cpu", "arm_cpu"]) +@dense_alter_layout.register(["cpu"]) def _alter_dense_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index 57dd024a27..76c97c5ad5 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -371,7 +371,7 @@ class ConstIntBoundAnalyzer::Impl } else if (op->op.same_as(tir::builtin::bitwise_and())) { return VisitBitwiseAnd(op); } else if (op->op.same_as(tir::builtin::vscale()) && TargetHasSVE()) { - return MakeBound(1, 16); + return MakeBound(1, kAArch64VScaleValues.size()); } else { return Everything(op->dtype); } diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index b747855bff..2655cf6671 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -476,12 +476,10 @@ class ScheduleBuilder : public ExprVisitor { 
mod_eq_structural_(meta_schedule::ModuleEquality::Create("ignore-ndarray")) { // Whether to use auto_scheduler schedule. use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); + database_ = meta_schedule::Database::Current(); if (backend::IsMetaScheduleEnabled()) { - database_ = meta_schedule::Database::Current(); CHECK(database_.defined()) << "ValueError: `use_meta_schedule` is enabled in Relay " "build, but no `meta_schedule.Database` context is provided. "; - } else { - database_ = NullOpt; } } diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 9e2fe63b00..ccc9734855 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -193,6 +193,7 @@ RELAY_REGISTER_OP("nn.matmul") .add_argument("tensor_a", "nD Tensor", "The first input Tensor.") .add_argument("tensor_b", "2D Tensor", "The second input Tensor.") .set_support_level(1) + .set_attr<FInferCorrectLayout>("FInferCorrectLayout", DenseInferCorrectLayout) .add_type_rel("Matmul", MatmulRel<MatmulAttrs>) .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable); diff --git a/src/tir/schedule/ir_comparator.cc b/src/tir/schedule/ir_comparator.cc index 00e573eaf6..a97cda266f 100644 --- a/src/tir/schedule/ir_comparator.cc +++ b/src/tir/schedule/ir_comparator.cc @@ -18,6 +18,8 @@ */ #include "./ir_comparator.h" +#include "../../arith/scalable_expression.h" + namespace tvm { namespace tir { @@ -74,7 +76,9 @@ bool TensorizeComparator::VisitStmt(const Stmt& n, const Stmt& other) { bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) { bool equal = n.same_as(other) || ((n->type_index() == other->type_index()) && - n.dtype().code() == other.dtype().code() && ExprComparator::VisitExpr(n, other)); + n.dtype().code() == other.dtype().code() && ExprComparator::VisitExpr(n, other)) || + (tvm::arith::ContainsVscaleCall(n) && analyzer_.CanProveEqual(n, other)); + if (!equal && assert_mode_) { std::ostringstream os; os << "Expression mismatch: " << n << " vs " << other; diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py index 9726f79d7a..f73d96e7c9 100644 --- a/tests/python/codegen/test_target_codegen_aarch64.py +++ b/tests/python/codegen/test_target_codegen_aarch64.py @@ -15,15 +15,17 @@ # specific language governing permissions and limitations # under the License. -import re +""" +Codegen tests for AArch64 +""" +import re import pytest import tvm from tvm import te from tvm.script import tir as T from tvm.topi.arm_cpu.pstate_attributes import SMEAttributes - from tvm.target.codegen import llvm_version_major @@ -496,6 +498,46 @@ def test_codegen_vscale(): assert re.findall(r"llvm.vscale.i32", llvm), "No vscale in generated LLVM." 
+@pytest.mark.skipif( + llvm_version_major() < 16, reason="SME is not supported in earlier versions of LLVM" +) +@pytest.mark.parametrize("dtype", ["float32"]) +def test_matmul_sme(dtype): + target = "llvm -mtriple=aarch64-linux-gnu -mattr=+v9a,+sme" + + def check_correct_assembly(dtype): + A = te.placeholder((32, 32), dtype=dtype, name="A") + B = te.placeholder((32, 32), dtype=dtype, name="B") + + with tvm.target.Target(target): + C = tvm.topi.arm_cpu.matmul.compute_matmul_sme(A, B, None, dtype, False, False) + prim_func = te.create_prim_func([A, B, C]) + + sch = tvm.tir.Schedule(prim_func) + tvm.topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch) + prim_func = sch.mod + + f = tvm.build(prim_func, target=target) + + assembly = f.get_source("asm") + smstart = re.findall(r"smstart\t(sm|za)", assembly) + loads = re.findall(r"ld1[whdb]\t{\s?za", assembly) + mopa = re.findall( + r"fmopa\tza[0-9].[shdb],( p[0-9]/[zm],)?( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", + assembly, + ) + stores = re.findall(r"st1[whdb]\t{\s?za", assembly) + smstop = re.findall(r"smstop\t(sm|za)", assembly) + + assert len(smstart) > 0 + assert len(loads) > 0 + assert len(mopa) > 0 + assert len(stores) > 0 + assert len(smstop) > 0 + + check_correct_assembly(dtype=dtype) + + @pytest.mark.skipif( llvm_version_major() < 11, reason="Vscale is not supported in earlier versions of LLVM" ) diff --git a/tests/python/integration/test_arm_aprofile.py b/tests/python/integration/test_arm_aprofile.py index af35a14297..d32fed00af 100644 --- a/tests/python/integration/test_arm_aprofile.py +++ b/tests/python/integration/test_arm_aprofile.py @@ -16,7 +16,6 @@ # under the License. """Tests for Arm(R) A-Profile Architecture.""" import os -import subprocess import numpy as np import pytest @@ -26,8 +25,6 @@ import tvm.testing from tvm import relay from tvm.relay.transform import ToMixedPrecision, FoldConstant from tvm.relay.build_module import bind_params_by_name -from tvm.testing.aot import AOTTestModel, AOTTestRunner, generate_ref_data, compile_and_run -from tvm.contrib import utils def get_mattr(dtype): @@ -80,96 +77,5 @@ def test_conv2d(dtype): lib.export_library(lib_path, cc="aarch64-linux-gnu-gcc") -# AOT Test Runner using the AArch64 Architecture Envelope Model (AEM) -# Fixed Virtual Platform (FVP) reference system. 
-# See: https://developer.arm.com/Tools%20and%20Software/Fixed%20Virtual%20Platforms -AOT_APROFILE_AEM_RUNNER = AOTTestRunner( - makefile="aprofile_aem", - pass_config={ - "tir.usmp.enable": False, - "tir.disable_assert": True, # AOT test infra creates 'fake' inputs that fail asserts - }, -) - - -@tvm.testing.requires_x86 -@tvm.testing.skip_if_32bit -def test_aem_simple_addition(): - """Tests a simple addition running on the AArch64 AEM.""" - inp = relay.var("data", shape=(1, 2, 4, 4)) - add = relay.add(inp, relay.const(np.ones((1, 2, 4, 4)))) - func = relay.Function([inp], add) - ir_mod = tvm.IRModule.from_expr(func) - ir_mod = tvm.relay.transform.InferType()(ir_mod) - - main_func = ir_mod["main"] - shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params} - type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params} - - input_data = np.random.uniform(size=shape_dict["data"]).astype(type_dict["data"]) - params = {} - inputs = {"data": input_data} - ref_outputs = generate_ref_data(ir_mod, inputs, params) - - compile_and_run( - AOTTestModel(module=ir_mod, inputs=inputs, outputs=ref_outputs, params=params), - target=tvm.target.Target("llvm -mtriple=aarch64-none-elf"), - runtime=tvm.relay.backend.Runtime("crt", {"system-lib": True}), - interface_api="packed", - use_unpacked_api=False, - runner=AOT_APROFILE_AEM_RUNNER, - ) - - -@tvm.testing.requires_x86 -@tvm.testing.skip_if_32bit -def test_aem_asm_sme(): - """ - Tests SME assembly runs on the AArch64 AEM. This test is used as a simple - sanity check until the TVM schedules are able to produce SME. - """ - c_code = """ - #include <stdio.h> - - int main(void) { - __asm volatile( - "smstart\\n" - "smstop\\n" - ); - printf("EXITTHESIM\\n"); - return 0; - } - """ - runner = AOT_APROFILE_AEM_RUNNER - - tmpdir = utils.tempdir() - build_path = os.path.join(tmpdir.path, "build") - os.makedirs(build_path, exist_ok=True) - - with open(build_path + "/test.c", "w") as f: - f.write(c_code) - - file_dir = os.path.dirname(os.path.abspath(__file__)) - makefile_dir = os.path.join(file_dir, "../../../tests/python/relay/aot") - makefile = os.path.join(makefile_dir, f"{runner.makefile}.mk") - - make_command = ( - f"make -f {makefile} build_dir={build_path}" - + f" TVM_ROOT={file_dir}/../../.." 
- + f" AOT_TEST_ROOT={makefile_dir}" - + " FVP_DIR=/opt/arm/fvp/Base_RevC_AEMvA_pkg/models/Linux64_GCC-9.3/" - ) - - compile_command = f"{make_command} aot_test_runner" - popen = subprocess.Popen(compile_command, cwd=build_path, shell=True, stdout=subprocess.PIPE) - return_code = popen.wait() - assert not return_code, "Failed to compile" - - run_command = f"{make_command} run" - popen = subprocess.Popen(run_command, cwd=build_path, shell=True, stdout=subprocess.PIPE) - return_code = popen.wait() - assert not return_code, "Failed to run" - - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py b/tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py index 8cc1c7c7aa..1272b35451 100644 --- a/tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py +++ b/tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py @@ -18,7 +18,7 @@ import tvm from tvm import meta_schedule as ms from tvm.script import tir as T -from tvm.tir.tensor_intrin import arm_cpu, cuda, rocm, x86 +from tvm.tir.tensor_intrin import cuda, rocm, x86 @tvm.script.ir_module diff --git a/tests/python/relay/strategy/arm_cpu/scalable_utils.py b/tests/python/relay/strategy/arm_cpu/scalable_utils.py new file mode 100644 index 0000000000..ad16a47612 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/scalable_utils.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm.tir.stmt_functor import post_order_visit, ir_transform + + +def calculate_extra_workspace_size_from_scalable_extents(func, known_vscale_value): + """ + The AOT executor needs to know the size of the workspace ahead of time, but this + isn't possible when some allocations are scalable (vscale is not known at compile-time). + If we know the target hardware, we can reason about the value of vscale ahead of time. + This function will calculate an upper-bound for the extra workspace bytes required by the + AOT executor given TIR function and a known value for vscale. 
+ """ + extra_workspace_bytes = 0 + is_scalable_extent = False + ana = tvm.arith.Analyzer() + + def replace_vscale_with_known_value(stmt): + nonlocal is_scalable_extent + if isinstance(stmt, tvm.tir.expr.Call) and stmt.op.name == "tir.vscale": + is_scalable_extent = True + return tvm.tir.IntImm(stmt.dtype, known_vscale_value) + + def calculate_workspace_bytes(stmt): + nonlocal extra_workspace_bytes, is_scalable_extent + if isinstance(stmt, tvm.tir.stmt.Allocate): + for extent in stmt.extents: + extent_stmt = tvm.tir.Evaluate(extent) + is_scalable_extent = False + mutated_extent = ir_transform(extent_stmt, replace_vscale_with_known_value, None) + # Non scalable extents are already included in the calculation by AOT + if is_scalable_extent: + alloc_bytes = ana.simplify(mutated_extent.value) * tvm.DataType(stmt.dtype).bits + extra_workspace_bytes += alloc_bytes + + post_order_visit(func.body, calculate_workspace_bytes) + return extra_workspace_bytes diff --git a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py b/tests/python/relay/strategy/arm_cpu/test_dense.py similarity index 50% rename from tests/python/relay/strategy/arm_cpu/test_dense_dsp.py rename to tests/python/relay/strategy/arm_cpu/test_dense.py index abd3ac4a3f..b9384e532e 100644 --- a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py +++ b/tests/python/relay/strategy/arm_cpu/test_dense.py @@ -14,14 +14,24 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import pytest import numpy as np + import tvm import tvm.testing from tvm import relay -from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data -from tvm.micro.testing.aot_test_utils import ( - AOT_CORSTONE300_RUNNER, +from tvm import meta_schedule +from tvm.testing.aot import ( + AOTTestModel, + AOTCompiledTestModel, + compile_and_run, + run_and_check, + generate_ref_data, ) +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER, AOT_APROFILE_AEM_RUNNER +from tvm.target.codegen import llvm_version_major +from tvm.relay.op.strategy.arm_cpu import arm_cpu_tir_strategy +from scalable_utils import calculate_extra_workspace_size_from_scalable_extents class BasicDenseTests: @@ -84,5 +94,80 @@ class TestDense(BasicDenseTests): enable_bias = tvm.testing.parameter(False, True) +@pytest.mark.skipif( + llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM" +) +@tvm.testing.requires_aprofile_aem_fvp +@pytest.mark.parametrize( + "data_shape,weight_shape", + [ + ((32, 32), (32, 32)), + ((2, 35), (6, 35)), + ((3, 3), (68, 3)), + ((79, 65), (152, 65)), + ], +) +@pytest.mark.parametrize("dtype", ["float32"]) +def test_sme_dense(data_shape, weight_shape, dtype): + np.random.seed(0) + + input_data = np.random.uniform(size=data_shape).astype(dtype) + inp = relay.var("data", shape=data_shape, dtype=dtype) + weight_data = np.random.uniform(size=weight_shape).astype(dtype) + weight = relay.const(weight_data, dtype=dtype) + + dense = relay.nn.dense(inp, weight) + func = relay.Function(relay.analysis.free_vars(dense), dense) + + ir_mod = tvm.IRModule.from_expr(func) + ir_mod = tvm.relay.transform.InferType()(ir_mod) + + inputs = {"data": input_data} + params = {} + ref_outputs = generate_ref_data(ir_mod, inputs, params) + + target = tvm.target.Target("llvm -mtriple=aarch64-none-elf -mattr=+v9.2a,+sme") + runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True}) + executor = tvm.relay.backend.Executor( + "aot", + { + "interface-api": 
"packed", + "unpacked-api": False, + }, + ) + + with tvm.transform.PassContext( + opt_level=3, config=AOT_APROFILE_AEM_RUNNER.pass_config + ), meta_schedule.database.ScheduleFnDatabase(arm_cpu_tir_strategy): + executor_factory = tvm.relay.build( + ir_mod, + target=target, + executor=executor, + runtime=runtime, + params=params, + ) + generated_func = executor_factory.lowered_ir_mods.items()[0][1][ + "tvmgen_default_fused_nn_matmul" + ] + extra_memory_in_bytes = calculate_extra_workspace_size_from_scalable_extents(generated_func, 4) + + test_model = AOTTestModel( + ir_mod, inputs, ref_outputs, params=params, extra_memory_in_bytes=extra_memory_in_bytes + ) + compiled = AOTCompiledTestModel(test_model, executor_factory) + + assembly = ( + compiled.executor_factory.module.imported_modules[0].imported_modules[0].get_source("asm") + ) + assert "fmopa" in assembly + + assert run_and_check( + models=[compiled], + interface_api="packed", + runner=AOT_APROFILE_AEM_RUNNER, + print_output_on_mismatch=True, + ) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_matmul.py b/tests/python/relay/strategy/arm_cpu/test_matmul.py new file mode 100644 index 0000000000..3b46c8019a --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_matmul.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest +import numpy as np + +import tvm +from tvm import relay +from tvm import meta_schedule +from tvm.testing.aot import ( + AOTTestModel, + AOTCompiledTestModel, + run_and_check, + generate_ref_data, +) +from tvm.micro.testing.aot_test_utils import AOT_APROFILE_AEM_RUNNER +from tvm.target.codegen import llvm_version_major +from tvm.relay.op.strategy.arm_cpu import arm_cpu_tir_strategy +from scalable_utils import calculate_extra_workspace_size_from_scalable_extents + + +@pytest.mark.skipif( + llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM" +) +@tvm.testing.requires_aprofile_aem_fvp +@pytest.mark.parametrize( + "data_shape,weight_shape,transpose_a,transpose_b", + [ + ((4, 63), (63, 10), False, False), + ((64, 32), (32, 32), False, True), + ((96, 64), (64, 32), False, False), + ((62, 3), (3, 3), False, False), + ((4, 5), (79, 5), False, True), + ((134, 36), (36, 111), False, False), + ((3, 10), (10, 72), False, False), + # Tensorization does not work when the reduction axis has unit iters. + # See https://github.com/apache/tvm/issues/16566 + # ((5, 1), (1, 5), False, False), + ], +) +@pytest.mark.parametrize("dtype", ["float32"]) +def test_sme_matmul_with_const_b(data_shape, weight_shape, transpose_a, transpose_b, dtype): + """ + Execution tests for matmul Scalable Matrix Extension (SME) schedule. 
+ """ + np.random.seed(0) + + input_data = np.random.uniform(size=data_shape).astype(dtype) + inp = relay.var("data", shape=data_shape, dtype=dtype) + weight_data = np.random.uniform(size=weight_shape).astype(dtype) + weight = relay.const(weight_data, dtype=dtype) + + matmul = relay.nn.matmul(inp, weight, transpose_a=transpose_a, transpose_b=transpose_b) + func = relay.Function(relay.analysis.free_vars(matmul), matmul) + + ir_mod = tvm.IRModule.from_expr(func) + ir_mod = tvm.relay.transform.InferType()(ir_mod) + + inputs = {"data": input_data} + params = {} + ref_outputs = generate_ref_data(ir_mod, inputs, params) + + target = tvm.target.Target("llvm -mtriple=aarch64-none-elf -mattr=+v9.2a,+sme") + runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True}) + executor = tvm.relay.backend.Executor( + "aot", + { + "interface-api": "packed", + "unpacked-api": False, + }, + ) + with tvm.transform.PassContext( + opt_level=3, config=AOT_APROFILE_AEM_RUNNER.pass_config + ), meta_schedule.database.ScheduleFnDatabase(arm_cpu_tir_strategy): + executor_factory = tvm.relay.build( + ir_mod, + target=target, + executor=executor, + runtime=runtime, + params=params, + ) + generated_func = executor_factory.lowered_ir_mods.items()[0][1][ + "tvmgen_default_fused_nn_matmul" + ] + extra_memory_in_bytes = calculate_extra_workspace_size_from_scalable_extents(generated_func, 4) + + test_model = AOTTestModel( + ir_mod, inputs, ref_outputs, params=params, extra_memory_in_bytes=extra_memory_in_bytes + ) + compiled = AOTCompiledTestModel(test_model, executor_factory) + + assembly = executor_factory.module.imported_modules[0].imported_modules[0].get_source("asm") + assert "fmopa" in assembly + + assert run_and_check( + models=[compiled], + interface_api="packed", + runner=AOT_APROFILE_AEM_RUNNER, + print_output_on_mismatch=True, + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relay/strategy/test_select_implementation.py b/tests/python/relay/strategy/test_select_implementation.py index d0767175d3..71dd688e29 100644 --- a/tests/python/relay/strategy/test_select_implementation.py +++ b/tests/python/relay/strategy/test_select_implementation.py @@ -258,18 +258,23 @@ def test_int8_depthwise_conv2d(target, expected_impl): @pytest.mark.parametrize( "target,expected_valid_impl,expected_impl", - [("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], "dense_pack.x86")], + [ + ( + "llvm -device=arm_cpu", + ["dense_pack.x86", "dense_nopack.x86"], + "dense_pack.x86", + ), + ], ) def test_dense(target, expected_valid_impl, expected_impl): target = tvm.target.Target(target) - data_shape = (30, 40) weight_shape = (30, 40) dtype = "float32" out = relay.nn.dense( relay.var("data", shape=data_shape, dtype=dtype), - relay.var("weight", shape=weight_shape, dtype=dtype), + relay.const(np.zeros((weight_shape)).astype(dtype)), out_dtype=dtype, ) out = run_infer_type(out) @@ -284,7 +289,51 @@ def test_dense(target, expected_valid_impl, expected_impl): ] valid_impl = relay.backend.te_compiler.get_valid_implementations(*args) selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False) + assert len(valid_impl) == len(expected_valid_impl) + for impl in valid_impl: + assert impl.name in expected_valid_impl + assert selected_impl.name == expected_impl + +@pytest.mark.skipif(llvm_version_major() < 15, reason="Older versions of LLVM don't support SME.") +@pytest.mark.parametrize( + "shape,expected_valid_impl,expected_impl", + [ + ( + (30, 40), + ["matmul.arm_cpu.sme", 
"dense_pack.x86", "dense_nopack.x86"], + "matmul.arm_cpu.sme", + ), + ( + (5, 1), + ["dense_pack.x86", "dense_nopack.x86"], + "dense_pack.x86", + ), + ], +) +def test_dense_with_sme_target(shape, expected_valid_impl, expected_impl): + target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme") + data_shape = shape + weight_shape = shape + dtype = "float32" + + out = relay.nn.dense( + relay.var("data", shape=data_shape, dtype=dtype), + relay.const(np.zeros((weight_shape)).astype(dtype)), + out_dtype=dtype, + ) + out = run_infer_type(out) + + with target: + args = [ + out.op, + out.attrs, + [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)], + out.checked_type, + target, + ] + valid_impl = relay.backend.te_compiler.get_valid_implementations(*args) + selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False) assert len(valid_impl) == len(expected_valid_impl) for impl in valid_impl: assert impl.name in expected_valid_impl diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 831070299f..f74b31157a 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -23,6 +23,7 @@ from tvm import relay, topi from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr from tvm.relay.testing import run_infer_type +from tvm.target.codegen import llvm_version_major import numpy as np import tvm.testing from tvm.relay import testing @@ -1451,6 +1452,61 @@ def test_alter_op_dense_packed_data(): assert tvm.ir.structural_equal(a, b) +@pytest.mark.skipif( + llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM" +) +def test_alter_op_dense_arm_cpu_sme(): + np.random.seed(0) + y_data = np.random.uniform(size=(64, 32)).astype("float32") + + def before(): + x = relay.var("x", shape=(32, 32), dtype="float32") + y = relay.const(y_data, dtype="float32") + dense = relay.nn.dense(x, y) + return relay.Function(analysis.free_vars(dense), dense) + + def expected(): + x = relay.var("x", shape=(32, 32), dtype="float32") + y = relay.const(y_data.transpose(), dtype="float32") + matmul = relay.nn.matmul(x, y) + return relay.Function(analysis.free_vars(matmul), matmul) + + with tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme"): + with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b) + + +@pytest.mark.skipif( + llvm_version_major() < 17, reason="SME is not supported in earlier versions of LLVM" +) +@pytest.mark.parametrize( + "transpose_b,transform_b", [(False, lambda x: x), (True, lambda x: x.transpose())] +) +def test_alter_op_matmul_arm_cpu_sme(transpose_b, transform_b): + np.random.seed(0) + y_data = np.random.uniform(size=(64, 32)).astype("float32") + + def before(): + x = relay.var("x", shape=(96, 32), dtype="float32") + y = relay.const(y_data, dtype="float32") + dense = relay.nn.matmul(x, y, transpose_a=False, transpose_b=transpose_b) + return relay.Function(analysis.free_vars(dense), dense) + + def expected(): + x = relay.var("x", shape=(96, 32), dtype="float32") + y = relay.const(transform_b(y_data), dtype="float32") + matmul = relay.nn.matmul(x, y) + return relay.Function(analysis.free_vars(matmul), matmul) + + with tvm.target.Target("llvm 
-mtriple=aarch64-linux-gnu -mattr=+v9.2a,+sme"): + with TempOpAttr("nn.dense", "FTVMAlterOpLayout", topi.arm_cpu.dense_alter_op._alter_dense): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b) + + def test_conv2d_strided_slice_packed_to_unpacked(): """We do not support propagating through packed to unpacked layout""" x_shape = (1, 1, 1, 1, 4) diff --git a/tests/python/topi/test_topi_matmul.py b/tests/python/topi/test_topi_matmul.py index 4b05dd3813..a7b3965aee 100644 --- a/tests/python/topi/test_topi_matmul.py +++ b/tests/python/topi/test_topi_matmul.py @@ -14,12 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +import pytest import numpy as np + import tvm import tvm.testing from tvm import te from tvm import topi from tvm.topi.utils import get_const_tuple +from tvm.topi.arm_cpu.matmul import compute_matmul_sme def with_tvm(lam, *args): @@ -148,7 +152,17 @@ def test_tensordot(): verify_tensordot((4, 3, 2, 2), (2, 4, 3, 5), ((1, 2, 0), (2, 0, 1))) +@pytest.mark.parametrize("transpose_a,transpose_b", [(True, False), (False, True)]) +def test_unsupported_sme_matmul_compute_transpose(transpose_a, transpose_b): + """ + SME matmul compute does not support transposed inputs for now. + """ + err_msg = "Compute definition currently does not support transposed inputs." + with pytest.raises(AssertionError, match=err_msg) as e: + compute_matmul_sme( + te.placeholder((32, 32)), te.placeholder((32, 32)), None, None, transpose_a, transpose_b + ) + + if __name__ == "__main__": - test_nn_matmul() - test_matmul() - test_tensordot() + tvm.testing.main()
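For reference, a minimal sketch (not part of the patch, untested here) of how the new SME dense schedule is exercised end-to-end from Relay. It mirrors the flow of the added test_sme_dense test and assumes an LLVM >= 17 build of TVM plus the SME-enabled target string used in the tests:

    import numpy as np
    import tvm
    from tvm import relay, meta_schedule
    from tvm.relay.op.strategy.arm_cpu import arm_cpu_tir_strategy

    # Small fp32 dense with a constant weight, as required by the SME dense alter-op path.
    data = relay.var("data", shape=(32, 32), dtype="float32")
    weight = relay.const(np.random.uniform(size=(32, 32)).astype("float32"))
    func = relay.Function([data], relay.nn.dense(data, weight))
    ir_mod = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(func))

    target = tvm.target.Target("llvm -mtriple=aarch64-none-elf -mattr=+v9.2a,+sme")
    runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True})
    executor = tvm.relay.backend.Executor("aot", {"interface-api": "packed", "unpacked-api": False})

    # ScheduleFnDatabase routes TIR scheduling through arm_cpu_tir_strategy, which applies
    # tir_schedule_matmul_sme whenever the "matmul_sme_gemm" block is present.
    with tvm.transform.PassContext(opt_level=3), meta_schedule.database.ScheduleFnDatabase(
        arm_cpu_tir_strategy
    ):
        lib = tvm.relay.build(ir_mod, target=target, executor=executor, runtime=runtime, params={})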