https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/177014

From 35b234f4e824be83acc25e797e1a00afee4c2ccd Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <[email protected]>
Date: Tue, 20 Jan 2026 17:56:01 +0000
Subject: [PATCH] [mlir] Implement indexed access op interfaces for memref,
 vector, gpu, nvgpu

This commit implements the IndexedAccessOpInterface and
IndexedMemCopyOpInterface for all operations in the memref and vector
dialects to which they appear to apply. The interface implementations
follow the existing code in FoldMemRefAliasOps and
ExtractAddressComputations. This commit also adds the interfaces to the
GPU subgroup MMA load and store operations and to the NVGPU operations
currently handled by those memref transformations (there may be more
suitable operations in the NVGPU dialect, but I haven't gone looking
systematically).

This code will be tested by a later commit that updates
fold-memref-alias-ops.
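
A rough sketch (not taken from this patch or its tests) of the kind of
rewrite these interfaces are meant to enable generically, mirroring what
FoldMemRefAliasOps already does for a `memref.load` through a
`memref.subview`; the value names are illustrative:

```mlir
// Before: load through an aliasing view.
%sv = memref.subview %base[%i, %j] [4, 4] [1, 1]
    : memref<16x16xf32> to memref<4x4xf32, strided<[16, 1], offset: ?>>
%v = memref.load %sv[%k, %l]
    : memref<4x4xf32, strided<[16, 1], offset: ?>>

// After the fold: the interface exposes the memref and indices, and the
// rewrite installs the updated (memref, indices) pair in place.
%ik = arith.addi %i, %k : index
%jl = arith.addi %j, %l : index
%v2 = memref.load %base[%ik, %jl] : memref<16x16xf32>
```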

Assisted-by: Claude Code, Cursor (interface boilerplate, sketching out
implementations)
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  10 ++
 .../Transforms/IndexedAccessOpInterfaceImpl.h |  21 +++
 mlir/include/mlir/Dialect/MemRef/IR/MemRef.h  |   1 +
 .../mlir/Dialect/MemRef/IR/MemRefOps.td       |  33 +++-
 .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 130 +++++++-------
 .../Transforms/MemoryAccessOpInterfacesImpl.h |  21 +++
 .../mlir/Dialect/Vector/IR/VectorOps.td       |  32 +++-
 .../Transforms/IndexedAccessOpInterfaceImpl.h |  21 +++
 mlir/lib/Dialect/GPU/CMakeLists.txt           |   1 +
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        |   3 +
 .../IndexedAccessOpInterfaceImpl.cpp          | 115 +++++++++++++
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp      |  69 ++++++++
 mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp    |   4 +
 .../Dialect/NVGPU/Transforms/CMakeLists.txt   |   1 +
 .../MemoryAccessOpInterfacesImpl.cpp          |  90 ++++++++++
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      |   4 +
 .../Dialect/Vector/Transforms/CMakeLists.txt  |   1 +
 .../IndexedAccessOpInterfaceImpl.cpp          | 162 ++++++++++++++++++
 mlir/lib/RegisterAllDialects.cpp              |   6 +
 19 files changed, 653 insertions(+), 72 deletions(-)
 create mode 100644 
mlir/include/mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h
 create mode 100644 
mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
 create mode 100644 
mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h
 create mode 100644 
mlir/lib/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.cpp
 create mode 100644 
mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
 create mode 100644 
mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td 
b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 51565aed92922..fae102c4cf41d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1827,6 +1827,9 @@ def GPU_SubgroupMmaLoadMatrixOp : 
GPU_Op<"subgroup_mma_load_matrix",
     matrix which eventually allows the lowering to determine the size of each
     row.  If the `transpose` attribute is present then the op does a 
transposed load.
 
+    The memory indices along each dimension must be in-bounds for that 
dimension
+    as with an ordinary `memref.load`.
+
     For integer types, the resulting `!gpu.mma_matrix` type needs to specify 
the
     signedness of the data if the matrix type is an `A` or `B` operand for
     `gpu.subgroup_mma_compute`.
@@ -1840,6 +1843,8 @@ def GPU_SubgroupMmaLoadMatrixOp : 
GPU_Op<"subgroup_mma_load_matrix",
      %0 = gpu.subgroup_mma_load_matrix src[%i,%j] : {leadDimension = 32 : i32}
           : memref<32x32xf16, 3>, !gpu.mma_matrix<16x16xf16, "AOp">
     ```
+
+    Implements IndexedAccessOpInterface.
   }];
 
   let arguments = (ins Arg<GPU_MMAMemRef, "",
@@ -1875,12 +1880,17 @@ def GPU_SubgroupMmaStoreMatrixOp : 
GPU_Op<"subgroup_mma_store_matrix",
     This op is often meant to be used along with 
`gpu.subgroup_mma_load_matrix` and
     `gpu.subgroup_mma_compute`.
 
+    The memory indices along each dimension must be in-bounds for that 
dimension
+    as with an ordinary `memref.load`.
+
     Example:
 
     ```mlir
     gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] : { leadDimension = 32 : i32}
                     : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
     ```
+
+    Implements IndexedAccessOpInterface.
   }];
 
   let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32, F64]>>:$src,
diff --git 
a/mlir/include/mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h 
b/mlir/include/mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h
new file mode 100644
index 0000000000000..d8a56545fd115
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h
@@ -0,0 +1,21 @@
+//===- IndexedAccessOpInterfaceImpl.h - 
-----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
+#define MLIR_DIALECT_GPU_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace gpu {
+void registerIndexedAccessOpInterfaceExternalModels(DialectRegistry &registry);
+} // namespace gpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h 
b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
index b7abcdea10a2a..8653eca0072b6 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
@@ -11,6 +11,7 @@
 
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
 #include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/Interfaces/AlignmentAttrInterface.h"
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td 
b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 6f8f1481725fc..af8c3c6465964 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -11,6 +11,7 @@
 
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/MemRef/IR/MemRefBase.td"
+include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td"
 include "mlir/Interfaces/AlignmentAttrInterface.td"
 include "mlir/Interfaces/CastInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
@@ -699,7 +700,8 @@ def MemRef_DimOp : MemRef_Op<"dim", [
 // DmaStartOp
 
//===----------------------------------------------------------------------===//
 
-def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
+def MemRef_DmaStartOp : MemRef_Op<"dma_start", [
+    IndexedMemCopyOpInterface]> {
   let summary = "non-blocking DMA operation that starts a transfer";
   let description = [{
     Syntax:
@@ -778,6 +780,10 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
       return {(*this)->operand_begin() + 1,
               (*this)->operand_begin() + 1 + getSrcMemRefRank()};
     }
+    // Alias to getSrcMemRef() for uniformity with other DMA-like ops.
+    ::mlir::TypedValue<::mlir::MemRefType> getSrc() {
+      return 
::llvm::cast<::mlir::TypedValue<::mlir::MemRefType>>(getSrcMemRef());
+    }
 
     // Returns the destination MemRefType for this DMA operations.
     Value getDstMemRef() { return getOperand(1 + getSrcMemRefRank()); }
@@ -786,6 +792,11 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
     unsigned getDstMemRefRank() {
       return ::llvm::cast<MemRefType>(getDstMemRef().getType()).getRank();
     }
+    // Alias to getDstMemRef() for uniformity with other DMA-like ops.
+    ::mlir::TypedValue<::mlir::MemRefType> getDst() {
+      return 
::llvm::cast<::mlir::TypedValue<::mlir::MemRefType>>(getDstMemRef());
+    }
+
     unsigned getSrcMemorySpace() {
       return 
::llvm::cast<MemRefType>(getSrcMemRef().getType()).getMemorySpaceAsInt();
     }
@@ -875,6 +886,10 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
       effects.emplace_back(MemoryEffects::Read::get(), &getTagMemRefMutable(),
                            SideEffects::DefaultResource::get());
     }
+
+    void setMemrefsAndIndices(RewriterBase& rewriter,
+      Value newSrc, ValueRange newSrcIndices,
+      Value newDst, ValueRange newDstIndices);
   }];
   let hasCustomAssemblyFormat = 1;
   let hasFolder = 1;
@@ -1066,7 +1081,8 @@ def GenericAtomicRMWOp : MemRef_Op<"generic_atomic_rmw", [
       SingleBlockImplicitTerminator<"AtomicYieldOp">,
       TypesMatchWith<"result type matches element type of memref",
                      "memref", "result",
-                     "::llvm::cast<MemRefType>($_self).getElementType()">
+                     "::llvm::cast<MemRefType>($_self).getElementType()">,
+      DeclareOpInterfaceMethods<IndexedAccessOpInterface>,
     ]> {
   let summary = "atomic read-modify-write operation with a region";
   let description = [{
@@ -1243,7 +1259,8 @@ def LoadOp : MemRef_Op<"load",
       DeclareOpInterfaceMethods<AlignmentAttrOpInterface>,
       DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
       DeclareOpInterfaceMethods<PromotableMemOpInterface>,
-      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>]> {
+      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
+      DeclareOpInterfaceMethods<IndexedAccessOpInterface>]> {
   let summary = "load operation";
   let description = [{
     The `load` op reads an element from a memref at the specified indices.
@@ -1404,7 +1421,9 @@ def MemRef_MemorySpaceCastOp : 
MemRef_Op<"memory_space_cast", [
 // PrefetchOp
 
//===----------------------------------------------------------------------===//
 
-def MemRef_PrefetchOp : MemRef_Op<"prefetch"> {
+def MemRef_PrefetchOp : MemRef_Op<"prefetch", [
+      DeclareOpInterfaceMethods<IndexedAccessOpInterface, ["getAccessedType"]>
+    ]> {
   let summary = "prefetch operation";
   let description = [{
     The "prefetch" op prefetches data from a memref location described with
@@ -2020,7 +2039,8 @@ def MemRef_StoreOp : MemRef_Op<"store",
       DeclareOpInterfaceMethods<AlignmentAttrOpInterface>,
       DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
       DeclareOpInterfaceMethods<PromotableMemOpInterface>,
-      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>]> {
+      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
+      DeclareOpInterfaceMethods<IndexedAccessOpInterface>]> {
   let summary = "store operation";
   let description = [{
     The `store` op stores an element into a memref at the specified indices.
@@ -2493,7 +2513,8 @@ def AtomicRMWOp : MemRef_Op<"atomic_rmw", [
       AllTypesMatch<["value", "result"]>,
       TypesMatchWith<"value type matches element type of memref",
                      "memref", "value",
-                     "::llvm::cast<MemRefType>($_self).getElementType()">
+                     "::llvm::cast<MemRefType>($_self).getElementType()">,
+      DeclareOpInterfaceMethods<IndexedAccessOpInterface>
     ]> {
   let summary = "atomic read-modify-write operation";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td 
b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 73d86283a5940..89d421ea1e80f 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -44,6 +44,8 @@ def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
     This operation is meant to follow the semantic of described here:
     
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
 
+    Implements IndexedAccessOpInterface.
+
     Example:
     ```mlir
     %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
@@ -304,12 +306,12 @@ def NVGPU_DeviceAsyncWaitOp : 
NVGPU_Op<"device_async_wait", []> {
 def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
   let summary = "Creates a `nvgpu.mbarrier` object.";
   let description = [{
-    The Op generates one or more `mbarrier` object, which is a barrier created 
in 
+    The Op generates one or more `mbarrier` object, which is a barrier created 
in
     shared memory and supports various synchronization behaviors for threads.
 
     The `mbarrier` object has the following type and alignment requirements:
       Type: .b64, Alignment: 8, Memory space: .shared
-    
+
     Example:
     ```mlir
       %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace 
= #gpu.address_space<workgroup>>
@@ -325,7 +327,7 @@ def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", 
[]> {
 def NVGPU_MBarrierGetOp : NVGPU_Op<"mbarrier.get", []> {
   let summary = "Return a pointer to an `nvgpu.mbarrier`.";
   let description = [{
-    The `nvgpu.mbarrier.get` operation retrieves a pointer to a specific 
+    The `nvgpu.mbarrier.get` operation retrieves a pointer to a specific
     `mbarrier` object from a group of barriers created by the 
`nvgpu.mbarrier.create` operation.
 
     Example:
@@ -360,7 +362,7 @@ def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
 def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
   let summary = "Checks if the `nvgpu.mbarrier` has completed its current 
phase.";
   let description = [{
-    Checks whether the mbarrier object has completed the phase. It is is a 
+    Checks whether the mbarrier object has completed the phase. It is is a
     non-blocking instruction which tests for the completion of the phase.
 
     Example:
@@ -376,7 +378,7 @@ def NVGPU_MBarrierTestWaitOp : 
NVGPU_Op<"mbarrier.test.wait", []> {
 def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
   let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
   let description = [{
-    The Op performs arrive-on operation on the `mbarrier` object and returns a 
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
     `nvgpu.mbarrier.token`.
 
     For more information, see
@@ -395,7 +397,7 @@ let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict 
`:` type($barriers) `-
 def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", 
[]> {
   let summary = "Performs arrive operation on the 
`nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
   let description = [{
-    The Op performs arrive-on operation on the `mbarrier` object and returns a 
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
     `nvgpu.mbarrier.token`.
 
     The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
@@ -414,13 +416,13 @@ def NVGPU_MBarrierArriveNoCompleteOp : 
NVGPU_Op<"mbarrier.arrive.nocomplete", []
 def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> 
{
   let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
   let description = [{
-    A thread executing the Op performs an expect-tx operation on the mbarrier 
-    object at the location specified by the address operand $barrier. The 
-    expect-tx operation, with an $txcount argument, increases the tx-count of 
-    an mbarrier object by the value specified by $txcount. This makes the 
-    current phase of the mbarrier object to expect and track the completion of 
+    A thread executing the Op performs an expect-tx operation on the mbarrier
+    object at the location specified by the address operand $barrier. The
+    expect-tx operation, with an $txcount argument, increases the tx-count of
+    an mbarrier object by the value specified by $txcount. This makes the
+    current phase of the mbarrier object to expect and track the completion of
     additional asynchronous transactions.
-    
+
     The `$txCount` specifies the number of element to the expect-tx operation.
 
     Example:
@@ -435,12 +437,12 @@ def NVGPU_MBarrierArriveExpectTxOp : 
NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
 def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
   let summary = "Waits for the `nvgpu.mbarrier` to complete its current 
phase.";
   let description = [{
-    Checks whether the mbarrier object has completed the phase. It is is a 
-    potentially blocking instruction which tests for the completion of the 
-    phase. Suspended thread resumes execution when the specified phase 
completes 
-    OR before the phase completes following a system-dependent time limit. 
+    Checks whether the mbarrier object has completed the phase. It is is a
+    potentially blocking instruction which tests for the completion of the
+    phase. Suspended thread resumes execution when the specified phase 
completes
+    OR before the phase completes following a system-dependent time limit.
 
-    The `$phaseParity` specifies either even phase (0) or odd phase (1) to 
+    The `$phaseParity` specifies either even phase (0) or odd phase (1) to
     wait.
 
     Example:
@@ -449,7 +451,7 @@ def NVGPU_MBarrierTryWaitParityOp : 
NVGPU_Op<"mbarrier.try_wait.parity", []> {
     ```
   }];
   let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, 
Index:$ticks, Index:$mbarId);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks 
attr-dict `:` type($barriers)";  
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks 
attr-dict `:` type($barriers)";
 }
 
 def NVGPU_TmaFenceOp : NVGPU_Op<"tma.fence.descriptor", []> {
@@ -469,7 +471,7 @@ def NVGPU_TmaFenceOp : NVGPU_Op<"tma.fence.descriptor", []> 
{
 def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
   let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
   let description = [{
-    The Op brings the cache line containing the given `$tmaDescriptor` for 
+    The Op brings the cache line containing the given `$tmaDescriptor` for
     subsequent use by the `tma.async.load` instruction.
   }];
   let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, 
Optional<I1>:$predicate);
@@ -481,27 +483,27 @@ def NVGPU_TmaPrefetchOp : 
NVGPU_Op<"tma.prefetch.descriptor", []> {
 def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", 
[AttrSizedOperandSegments]> {
   let summary = "TMA asynchronous load";
   let description = [{
-    The Op loads a tile memory region from global memory to shared memory by 
+    The Op loads a tile memory region from global memory to shared memory by
     Tensor Memory Access (TMA).
-    
+
     `$tensorMapDescriptor` is tensor map descriptor which has information about
     tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
 
-    The Op uses `$barrier` mbarrier based completion mechanism. 
-  }];  
+    The Op uses `$barrier` mbarrier based completion mechanism.
+  }];
   let arguments = (ins  Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
                         NVGPU_MBarrierGroup:$barriers,
                         NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
-                        Variadic<Index>:$coordinates, 
+                        Variadic<Index>:$coordinates,
                         Index:$mbarId,
                         Optional<I16>:$multicastMask,
                         Optional<I1>:$predicate);
   let assemblyFormat = [{
-    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` 
+    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
       `to` $dst
       (`multicast_mask` `=` $multicastMask^ )?
       (`,` `predicate` `=` $predicate^)?
-      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) 
+      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
       `->` type($dst)
   }];
   let hasVerifier = 1;
@@ -511,15 +513,15 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", 
[AttrSizedOperandSegments]
 def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", 
[AttrSizedOperandSegments]> {
   let summary = "TMA asynchronous store";
   let description = [{
-    The Op store a tile memory region from global memory to shared memory by 
+    The Op store a tile memory region from global memory to shared memory by
     Tensor Memory Access (TMA).
-    
+
     `$tensorMapDescriptor` is tensor map descriptor which has information about
     tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
-  }];  
+  }];
   let arguments = (ins  Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
                         Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, 
FullEffect>]>:$tensorMapDescriptor,
-                        Variadic<Index>:$coordinates, 
+                        Variadic<Index>:$coordinates,
                         Optional<I1>:$predicate);
   let assemblyFormat = [{
       $src `to` $tensorMapDescriptor `[` $coordinates `]`
@@ -533,11 +535,11 @@ def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", 
[AttrSizedOperandSegment
 def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
   let summary = "TMA create descriptor";
   let description = [{
-    The Op creates a tensor map descriptor object representing tiled memory 
-    region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The 
+    The Op creates a tensor map descriptor object representing tiled memory
+    region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The
     descriptor is used by Tensor Memory Access (TMA).
 
-    The `tensor` is the source tensor to be tiled. 
+    The `tensor` is the source tensor to be tiled.
 
     The `boxDimensions` is the size of the tiled memory region in each 
dimension.
 
@@ -557,15 +559,15 @@ def NVGPU_TmaCreateDescriptorOp : 
NVGPU_Op<"tma.create.descriptor", []> {
 def NVGPU_WarpgroupGenerateDescriptorOp : 
NVGPU_Op<"warpgroup.generate.descriptor", []> {
   let summary = "Generate a warpgroup matrix descriptor";
   let description = [{
-  This Op builds a `nvgpu.warpgroup.descriptor` that is used by 
-  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and 
+  This Op builds a `nvgpu.warpgroup.descriptor` that is used by
+  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
   accumulate.
 
-  The descriptor specifies the properties of the matrix in shared memory that 
-  is a multiplicand in the matrix multiply and accumulate operation. 
-  }];  
+  The descriptor specifies the properties of the matrix in shared memory that
+  is a multiplicand in the matrix multiply and accumulate operation.
+  }];
   let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
-  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor, 
+  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
                        NVGPU_TensorMapDescriptor:$tensorMap);
   let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) 
`,` type($tensorMap) `->` type($descriptor)}];
   let hasVerifier = 1;
@@ -573,42 +575,42 @@ def NVGPU_WarpgroupGenerateDescriptorOp : 
NVGPU_Op<"warpgroup.generate.descripto
 
 def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
   let description = [{
-    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps) 
-    matrix-multiply-and-accumulate (mma) operation that results in 
-    `nvvm.wgmma.mma_async`. 
-    
-    The operands are `descriptorA` and `descriptorB` that are wgmma matrix 
-    descriptors that shows the properties of the matrix in shared memory. The 
-    results are thread-level ownership to the warpgroup-level mma operation 
+    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
+    matrix-multiply-and-accumulate (mma) operation that results in
+    `nvvm.wgmma.mma_async`.
+
+    The operands are `descriptorA` and `descriptorB` that are wgmma matrix
+    descriptors that shows the properties of the matrix in shared memory. The
+    results are thread-level ownership to the warpgroup-level mma operation
     shape. The shape is deduced from the descriptor types and output vector.
 
-    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete 
-    the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX 
-    instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and 
-    surrounds them between `wgmma.fence.aligned` and 
+    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
+    the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX
+    instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and
+    surrounds them between `wgmma.fence.aligned` and
     `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
 
     Example:
     ```mlir
-      %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
-                 !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
-                 !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
+      %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2:
+                 !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
+                 !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-                 -> 
+                 ->
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
     ```
   }];
 
-  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA, 
-                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,           
                                    
+  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
+                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
                        DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
                        OptionalAttr<UnitAttr>:$transposeA,
                        OptionalAttr<UnitAttr>:$transposeB,
                        NVGPU_WarpgroupAccumulator:$matrixC);
   let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
-  let assemblyFormat = [{    
+  let assemblyFormat = [{
     $descriptorA`,` $descriptorB`,` $matrixC attr-dict
     `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` 
type($matrixD)
   }];
@@ -617,29 +619,29 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
 
 def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
   let description = [{
-    The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result 
-    in $matrixD to given memref. 
+    The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result
+    in $matrixD to given memref.
 
     [See the details of register fragment layout for accumulator matrix D]
-    
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
 
+    
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
 
     Note that, the op must be run with warp group.
   }];
 
   let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
                        Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
-  
+
   let assemblyFormat = [{
     $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
   }];
   let hasVerifier = 1;
 }
 
-def NVGPU_WarpgroupMmaInitAccumulatorOp : 
NVGPU_Op<"warpgroup.mma.init.accumulator"> {  
+def NVGPU_WarpgroupMmaInitAccumulatorOp : 
NVGPU_Op<"warpgroup.mma.init.accumulator"> {
   let summary = "Initializes the accumulator matrix";
 
   let description = [{
-    This Op generates and initializes the accumulator matrix for 
+    This Op generates and initializes the accumulator matrix for
     `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
   }];
   let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
@@ -662,7 +664,7 @@ def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
                        UnitAttr:$ftz);
   let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
   let assemblyFormat = [{
-    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}` 
+    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
     attr-dict `:` type($out)
   }];
   let hasVerifier = 1;
diff --git 
a/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h 
b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
new file mode 100644
index 0000000000000..50d2223912a27
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
@@ -0,0 +1,21 @@
+//===- MemoryAccessOpInterfacesImpl.h 
-------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+#define MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace nvgpu {
+void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry &registry);
+} // namespace nvgpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td 
b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index ddb04b6bbe40d..fc4932f8ee6c3 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1709,6 +1709,8 @@ def Vector_LoadOp : Vector_Op<"load", [
     load operation. It must be a positive power of 2. The operation must access
     memory at an address aligned to this boundary. Violating this requirement
     triggers immediate undefined behavior.
+
+    Implements IndexedAccessOpInterface.
   }];
 
   let arguments = (ins Arg<AnyMemRef, "the reference to load from",
@@ -1825,6 +1827,8 @@ def Vector_StoreOp : Vector_Op<"store", [
     store operation. It must be a positive power of 2. The operation must 
access
     memory at an address aligned to this boundary. Violating this requirement
     triggers immediate undefined behavior.
+
+    Implements IndexedAccessOpInterface.
   }];
 
   let arguments = (ins
@@ -1900,6 +1904,8 @@ def Vector_MaskedLoadOp :
     closely correspond to those of the `llvm.masked.load`
     
[intrinsic](https://llvm.org/docs/LangRef.html#llvm-masked-load-intrinsics).
 
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -1994,6 +2000,8 @@ def Vector_MaskedStoreOp :
     closely correspond to those of the `llvm.masked.store`
     
[intrinsic](https://llvm.org/docs/LangRef.html#llvm-masked-store-intrinsics).
 
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -2103,6 +2111,14 @@ def Vector_GatherOp :
     memory at an address aligned to this boundary. Violating this requirement
     triggers immediate undefined behavior.
 
+    Note that if the base argument is not contiguous in memory (for example,
+    it is the result of a `tensor.extract` or a `memref.subview`), the entries
+    of `index_vec` must respect the strides in the underlying memory: they are
+    applied as pure 1-D offsets and are not decomposed in order to apply
+    the strides on the base.
+
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -2196,6 +2212,14 @@ def Vector_ScatterOp
     memory at an address aligned to this boundary. Violating this requirement
     triggers immediate undefined behavior.
 
+    Note that if the base argument is not contiguous in memory (for example,
+    it is the result of a `tensor.extract` or a `memref.subview`), the entries
+    of `index_vec` must respect the strides in the underlying memory: they are
+    applied as pure 1-D offsets and are not decomposed in order to apply
+    the strides on the base.
+
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -2280,6 +2304,8 @@ def Vector_ExpandLoadOp :
 
     Note, at the moment this Op is only available for fixed-width vectors.
 
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -2370,6 +2396,8 @@ def Vector_CompressStoreOp :
 
     Note, at the moment this Op is only available for fixed-width vectors.
 
+    Implements IndexedAccessOpInterface.
+
     Examples:
 
     ```mlir
@@ -2534,7 +2562,7 @@ def Vector_TypeCastOp :
 }
 
 def Vector_ConstantMaskOp :
-  Vector_Op<"constant_mask", [Pure, 
+  Vector_Op<"constant_mask", [Pure,
    DeclareOpInterfaceMethods<VectorUnrollOpInterface>
    ]>,
     Arguments<(ins DenseI64ArrayAttr:$mask_dim_sizes)>,
@@ -2594,7 +2622,7 @@ def Vector_ConstantMaskOp :
 }
 
 def Vector_CreateMaskOp :
-  Vector_Op<"create_mask", [Pure, 
+  Vector_Op<"create_mask", [Pure,
    DeclareOpInterfaceMethods<VectorUnrollOpInterface>
    ]>,
     Arguments<(ins Variadic<Index>:$operands)>,
diff --git 
a/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h 
b/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h
new file mode 100644
index 0000000000000..8393b6efd14cc
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h
@@ -0,0 +1,21 @@
+//===- IndexedAccessOpInterfaceImpl.h 
-------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
+#define MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace vector {
+void registerIndexedAccessOpInterfaceExternalModels(DialectRegistry &registry);
+} // namespace vector
+} // namespace mlir
+
+#endif // MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt 
b/mlir/lib/Dialect/GPU/CMakeLists.txt
index f2f010a771b77..547812da0ab97 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -34,6 +34,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/EliminateBarriers.cpp
   Transforms/GlobalIdRewriter.cpp
   Transforms/KernelOutlining.cpp
+  Transforms/IndexedAccessOpInterfaceImpl.cpp
   Transforms/MemoryPromotion.cpp
   Transforms/ModuleToBinary.cpp
   Transforms/NVVMAttachTarget.cpp
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp 
b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 033a94e3f8fce..4f7f54a850c45 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -283,6 +283,9 @@ void GPUDialect::initialize() {
   addInterfaces<GPUInlinerInterface>();
   declarePromisedInterface<bufferization::BufferDeallocationOpInterface,
                            TerminatorOp>();
+  declarePromisedInterfaces<memref::IndexedAccessOpInterface,
+                            SubgroupMmaLoadMatrixOp,
+                            SubgroupMmaStoreMatrixOp>();
   declarePromisedInterfaces<
       ValueBoundsOpInterface, ClusterDimOp, ClusterDimBlocksOp, ClusterIdOp,
       ClusterBlockIdOp, BlockDimOp, BlockIdOp, GridDimOp, ThreadIdOp, LaneIdOp,
diff --git a/mlir/lib/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.cpp 
b/mlir/lib/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.cpp
new file mode 100644
index 0000000000000..9263745f69413
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.cpp
@@ -0,0 +1,115 @@
+//===- IndexedAccessOpInterfaceImpl.cpp 
-----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+using namespace mlir::gpu;
+
+/// Given a GPU matrix type that will be loaded or stored, the leading 
dimension
+/// of the matrix in memory, and whether or not the matrix is transposed,
+/// compute the size of the linear memory that the load/store spans as
+/// dC + leadingDim * (dR - 1) where dR and dC are the non-contiguous and
+/// contiguous matrix dimensions, respectively (we advance to the (dR - 1)-th
+/// row and then access the first dC elements of it).
+static int64_t get1DAccessSize(MMAMatrixType matrixType, int64_t leadingDim,
+                               bool transpose) {
+  assert(matrixType.getShape().size() == 2 && "expected matrices to be 2D");
+
+  int64_t c = matrixType.getShape()[1];
+  int64_t r = matrixType.getShape()[0];
+  if (transpose)
+    std::swap(c, r);
+  return c + leadingDim * (r - 1);
+}
+
+namespace {
+struct SubgroupMmaLoadMatrixOpImpl final
+    : IndexedAccessOpInterface::ExternalModel<SubgroupMmaLoadMatrixOpImpl,
+                                              SubgroupMmaLoadMatrixOp> {
+  TypedValue<MemRefType> getMemref(Operation *op) const {
+    return cast<SubgroupMmaLoadMatrixOp>(op).getSrcMemref();
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<SubgroupMmaLoadMatrixOp>(op).getIndices();
+  }
+
+  /// This returns a 1-D shape so that it's clear that both linearization and
+  /// folding in expand/collapse_shape operations are allowed.
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    auto loadOp = cast<SubgroupMmaLoadMatrixOp>(op);
+    return {get1DAccessSize(cast<MMAMatrixType>(loadOp.getRes().getType()),
+                            loadOp.getLeadDimension().getZExtValue(),
+                            loadOp.getTranspose().value_or(false))};
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value 
newMemref,
+                         ValueRange newIndices) const {
+    auto loadOp = cast<SubgroupMmaLoadMatrixOp>(op);
+    rewriter.modifyOpInPlace(loadOp, [&]() {
+      loadOp.getSrcMemrefMutable().assign(newMemref);
+      loadOp.getIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+struct SubgroupMmaStoreMatrixOpImpl final
+    : IndexedAccessOpInterface::ExternalModel<SubgroupMmaStoreMatrixOpImpl,
+                                              SubgroupMmaStoreMatrixOp> {
+  TypedValue<MemRefType> getMemref(Operation *op) const {
+    return cast<SubgroupMmaStoreMatrixOp>(op).getDstMemref();
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<SubgroupMmaStoreMatrixOp>(op).getIndices();
+  }
+
+  /// This returns a 1-D shape so that it's clear that both linearization and
+  /// folding in expand/collapse_shape operations are allowed.
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    auto storeOp = cast<SubgroupMmaStoreMatrixOp>(op);
+    return {get1DAccessSize(storeOp.getSrc().getType(),
+                            storeOp.getLeadDimension().getZExtValue(),
+                            storeOp.getTranspose().value_or(false))};
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value 
newMemref,
+                         ValueRange newIndices) const {
+    auto storeOp = cast<SubgroupMmaStoreMatrixOp>(op);
+    rewriter.modifyOpInPlace(storeOp, [&]() {
+      storeOp.getDstMemrefMutable().assign(newMemref);
+      storeOp.getIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+} // namespace
+
+void mlir::gpu::registerIndexedAccessOpInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
+    
SubgroupMmaLoadMatrixOp::attachInterface<SubgroupMmaLoadMatrixOpImpl>(*ctx);
+    SubgroupMmaStoreMatrixOp::attachInterface<SubgroupMmaStoreMatrixOpImpl>(
+        *ctx);
+  });
+}
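
As a worked instance of the `get1DAccessSize` formula above, reusing the
subgroup MMA load shape from the op documentation in this patch (op syntax as
in the GPU dialect tests; operand names illustrative): with dC = 16, dR = 16,
leadingDim = 32, and no transpose, the reported 1-D accessed shape is
16 + 32 * (16 - 1) = 496 elements.

```mlir
// The external model reports getAccessedShape() = [496] for this load:
// the linear span from the first element read to the last.
%0 = gpu.subgroup_mma_load_matrix %src[%i, %j] {leadDimension = 32 : index}
     : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
```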
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp 
b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index a9103b4d438ea..4128c6a751847 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -1330,6 +1330,26 @@ LogicalResult DmaStartOp::fold(FoldAdaptor adaptor,
   return foldMemRefCast(*this);
 }
 
+void DmaStartOp::setMemrefsAndIndices(RewriterBase &rewriter, Value newSrc,
+                                      ValueRange newSrcIndices, Value newDst,
+                                      ValueRange newDstIndices) {
+  // dma_start needs special handling because its memref ranks are variadic.
+  SmallVector<Value> newOperands;
+  newOperands.push_back(newSrc);
+  llvm::append_range(newOperands, newSrcIndices);
+  newOperands.push_back(newDst);
+  llvm::append_range(newOperands, newDstIndices);
+  newOperands.push_back(getNumElements());
+  newOperands.push_back(getTagMemRef());
+  llvm::append_range(newOperands, getTagIndices());
+  if (isStrided()) {
+    newOperands.push_back(getStride());
+    newOperands.push_back(getNumElementsPerStride());
+  }
+
+  rewriter.modifyOpInPlace(*this, [&]() { (*this)->setOperands(newOperands); 
});
+}
+
 // ---------------------------------------------------------------------------
 // DmaWaitOp
 // ---------------------------------------------------------------------------
@@ -1558,6 +1578,15 @@ void GenericAtomicRMWOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict((*this)->getAttrs());
 }
 
+std::optional<SmallVector<Value>> GenericAtomicRMWOp::updateMemrefAndIndices(
+    RewriterBase &rewriter, Value newMemref, ValueRange newIndices) {
+  rewriter.modifyOpInPlace(*this, [&]() {
+    getMemrefMutable().assign(newMemref);
+    getIndicesMutable().assign(newIndices);
+  });
+  return std::nullopt;
+}
+
 
//===----------------------------------------------------------------------===//
 // AtomicYieldOp
 
//===----------------------------------------------------------------------===//
@@ -1725,6 +1754,16 @@ OpFoldResult LoadOp::fold(FoldAdaptor adaptor) {
   return splatAttr.getSplatValue<Attribute>();
 }
 
+std::optional<SmallVector<Value>>
+LoadOp::updateMemrefAndIndices(RewriterBase &rewriter, Value newMemref,
+                               ValueRange newIndices) {
+  rewriter.modifyOpInPlace(*this, [&]() {
+    getMemrefMutable().assign(newMemref);
+    getIndicesMutable().assign(newIndices);
+  });
+  return std::nullopt;
+}
+
 FailureOr<std::optional<SmallVector<Value>>>
 LoadOp::bubbleDownCasts(OpBuilder &builder) {
   return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getMemrefMutable(),
@@ -1868,6 +1907,16 @@ LogicalResult PrefetchOp::fold(FoldAdaptor adaptor,
   return foldMemRefCast(*this);
 }
 
+std::optional<SmallVector<Value>>
+PrefetchOp::updateMemrefAndIndices(RewriterBase &rewriter, Value newMemref,
+                                   ValueRange newIndices) {
+  rewriter.modifyOpInPlace(*this, [&]() {
+    getMemrefMutable().assign(newMemref);
+    getIndicesMutable().assign(newIndices);
+  });
+  return std::nullopt;
+}
+
 
//===----------------------------------------------------------------------===//
 // RankOp
 
//===----------------------------------------------------------------------===//
@@ -2902,6 +2951,16 @@ LogicalResult StoreOp::fold(FoldAdaptor adaptor,
   return foldMemRefCast(*this, getValueToStore());
 }
 
+std::optional<SmallVector<Value>>
+StoreOp::updateMemrefAndIndices(RewriterBase &rewriter, Value newMemref,
+                                ValueRange newIndices) {
+  rewriter.modifyOpInPlace(*this, [&]() {
+    getMemrefMutable().assign(newMemref);
+    getIndicesMutable().assign(newIndices);
+  });
+  return std::nullopt;
+}
+
 FailureOr<std::optional<SmallVector<Value>>>
 StoreOp::bubbleDownCasts(OpBuilder &builder) {
   return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getMemrefMutable(),
@@ -3940,6 +3999,16 @@ AtomicRMWOp::bubbleDownCasts(OpBuilder &builder) {
                                                             getResult());
 }
 
+std::optional<SmallVector<Value>>
+AtomicRMWOp::updateMemrefAndIndices(RewriterBase &rewriter, Value newMemref,
+                                    ValueRange newIndices) {
+  rewriter.modifyOpInPlace(*this, [&]() {
+    getMemrefMutable().assign(newMemref);
+    getIndicesMutable().assign(newIndices);
+  });
+  return std::nullopt;
+}
+
 
//===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp 
b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 237aab4d7f309..b60658c7e3041 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -40,6 +41,9 @@ void NVGPUDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
       >();
+  declarePromisedInterfaces<memref::IndexedAccessOpInterface, LdMatrixOp>();
+  declarePromisedInterfaces<memref::IndexedMemCopyOpInterface,
+                            DeviceAsyncCopyOp>();
 }
 
 bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt 
b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
index 3f967d2b189be..8852ed7fb30a8 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRNVGPUTransforms
   CreateAsyncGroups.cpp
+  MemoryAccessOpInterfacesImpl.cpp
   OptimizeSharedMemory.cpp
   MmaSyncTF32Transform.cpp
   Utils.cpp
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp 
b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
new file mode 100644
index 0000000000000..a383b831deb76
--- /dev/null
+++ b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
@@ -0,0 +1,90 @@
+//===- MemoryAccessOpInterfacesImpl.cpp 
-----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
+
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+using namespace mlir::nvgpu;
+
+namespace {
+struct LdMatrixOpInterface final
+    : IndexedAccessOpInterface::ExternalModel<LdMatrixOpInterface, LdMatrixOp> 
{
+  TypedValue<MemRefType> getMemref(Operation *op) const {
+    return cast<LdMatrixOp>(op).getSrcMemref();
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<LdMatrixOp>(op).getIndices();
+  }
+
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    VectorType vecTy = cast<LdMatrixOp>(op).getRes().getType();
+    return llvm::to_vector(vecTy.getShape());
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value 
newMemref,
+                         ValueRange newIndices) const {
+    auto ldMatrixOp = cast<LdMatrixOp>(op);
+    rewriter.modifyOpInPlace(ldMatrixOp, [&]() {
+      ldMatrixOp.getSrcMemrefMutable().assign(newMemref);
+      ldMatrixOp.getIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+struct DeviceAsyncCopyOpInterface final
+    : IndexedMemCopyOpInterface::ExternalModel<DeviceAsyncCopyOpInterface,
+                                               DeviceAsyncCopyOp> {
+  TypedValue<MemRefType> getSrc(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getSrc();
+  }
+
+  Operation::operand_range getSrcIndices(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getSrcIndices();
+  }
+
+  TypedValue<MemRefType> getDst(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getDst();
+  }
+
+  Operation::operand_range getDstIndices(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getDstIndices();
+  }
+
+  void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value 
newSrc,
+                            ValueRange newSrcIndices, Value newDst,
+                            ValueRange newDstIndices) const {
+    auto copyOp = cast<DeviceAsyncCopyOp>(op);
+    rewriter.modifyOpInPlace(copyOp, [&]() {
+      copyOp.getSrcMutable().assign(newSrc);
+      copyOp.getSrcIndicesMutable().assign(newSrcIndices);
+      copyOp.getDstMutable().assign(newDst);
+      copyOp.getDstIndicesMutable().assign(newDstIndices);
+    });
+  }
+};
+} // namespace
+
+void mlir::nvgpu::registerMemoryAccessOpInterfacesExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, nvgpu::NVGPUDialect *dialect) {
+    LdMatrixOp::attachInterface<LdMatrixOpInterface>(*ctx);
+    DeviceAsyncCopyOp::attachInterface<DeviceAsyncCopyOpInterface>(*ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp 
b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 085f879c2d0e6..7afcd48e475db 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -485,6 +486,9 @@ void VectorDialect::initialize() {
                             YieldOp>();
   declarePromisedInterfaces<SubsetOpInterface, TransferReadOp,
                             TransferWriteOp>();
+  declarePromisedInterfaces<memref::IndexedAccessOpInterface, LoadOp, StoreOp,
+                            MaskedLoadOp, MaskedStoreOp, ExpandLoadOp,
+                            CompressStoreOp, GatherOp, ScatterOp>();
   declarePromisedInterface<SubsetExtractionOpInterface, TransferReadOp>();
   declarePromisedInterface<SubsetInsertionOpInterface, TransferWriteOp>();
   declarePromisedInterface<ConvertToLLVMPatternInterface, VectorDialect>();
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt 
b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index 4e0f07af95984..112a1db6fe93b 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRVectorTransforms
   BufferizableOpInterfaceImpl.cpp
+  IndexedAccessOpInterfaceImpl.cpp
   LowerVectorBitCast.cpp
   LowerVectorBroadcast.cpp
   LowerVectorContract.cpp
diff --git 
a/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp 
b/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp
new file mode 100644
index 0000000000000..3dbc5c93dc085
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp
@@ -0,0 +1,162 @@
+//===- IndexedAccessOpInterfaceImpl.cpp 
-----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h"
+
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+using namespace mlir::vector;
+
+namespace {
+template <typename LoadStoreOp>
+struct VectorLoadStoreLikeOpInterface final
+    : IndexedAccessOpInterface::ExternalModel<
+          VectorLoadStoreLikeOpInterface<LoadStoreOp>, LoadStoreOp> {
+  TypedValue<MemRefType> getMemref(Operation *op) const {
+    return dyn_cast<TypedValue<MemRefType>>(cast<LoadStoreOp>(op).getBase());
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<LoadStoreOp>(op).getIndices();
+  }
+
+  // Note: this is an upper bound on what's accessed in the case of operations
+  // like expandload or compressstore.
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    VectorType vecTy = cast<LoadStoreOp>(op).getVectorType();
+    // Drop leading unit dimensions, since they don't impact the vector
+    // semantics of operations. That is, none of these load/store variants
+    // change their behavior if the loaded/stored vector type is changed from
+    // vector<1x...x1x[shape]xT> to vector<[shape]xT>.
+    SmallVector<int64_t> result(
+        vecTy.getShape().drop_while([](int64_t l) { return l == 1; }));
+    return result;
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value 
newMemref,
+                         ValueRange newIndices) const {
+    VectorType vecTy = cast<LoadStoreOp>(op).getVectorType();
+    bool droppedUnitDims =
+        static_cast<int64_t>(newIndices.size()) < vecTy.getRank();
+    if (LLVM_LIKELY(!droppedUnitDims)) {
+      rewriter.modifyOpInPlace(op, [&]() {
+        auto concreteOp = cast<LoadStoreOp>(op);
+        concreteOp.getBaseMutable().assign(newMemref);
+        concreteOp.getIndicesMutable().assign(newIndices);
+      });
+      return std::nullopt;
+    }
+
+    VectorType droppedDimsTy = VectorType::get(
+        vecTy.getShape().take_back(newIndices.size()), vecTy.getElementType(),
+        vecTy.getScalableDims().take_back(newIndices.size()));
+
+    IRMapping dropDimsMap;
+    for (Value arg : op->getOperands()) {
+      if (arg.getType() == vecTy) {
+        Value castArg = vector::ShapeCastOp::create(rewriter, arg.getLoc(),
+                                                    droppedDimsTy, arg);
+        dropDimsMap.map(arg, castArg);
+      }
+    }
+
+    // For operations with results (loads), clone with mapped operands and
+    // return a shape_cast back to the original type.
+    if (op->getNumResults() == 1) {
+      Operation *newOp = rewriter.clone(*op, dropDimsMap);
+      rewriter.modifyOpInPlace(newOp, [&]() {
+        auto concreteOp = cast<LoadStoreOp>(newOp);
+        concreteOp.getBaseMutable().assign(newMemref);
+        concreteOp.getIndicesMutable().assign(newIndices);
+        newOp->getResult(0).setType(droppedDimsTy);
+      });
+      Value castBack = ShapeCastOp::create(rewriter, newOp->getLoc(), vecTy,
+                                           newOp->getResult(0));
+      return {{castBack}};
+    }
+
+    // For operations without results (stores), modify in place with cast
+    // operands.
+    rewriter.modifyOpInPlace(op, [&]() {
+      auto concreteOp = cast<LoadStoreOp>(op);
+      concreteOp.getBaseMutable().assign(newMemref);
+      concreteOp.getIndicesMutable().assign(newIndices);
+      for (OpOperand &operand : op->getOpOperands()) {
+        if (Value replacement = dropDimsMap.lookupOrNull(operand.get()))
+          operand.set(replacement);
+      }
+    });
+    return std::nullopt;
+  }
+
+  // TODO: The various load and store operations (at the very least
+  // vector.load and vector.store) should be taught a `startsInbounds`
+  // attribute that would let us optimize index generation.
+  bool hasInboundsIndices(Operation *) const { return false; }
+};
+
+template <typename GatherScatterOp>
+struct GatherScatterLikeOpInterface final
+    : IndexedAccessOpInterface::ExternalModel<
+          GatherScatterLikeOpInterface<GatherScatterOp>, GatherScatterOp> {
+  TypedValue<MemRefType> getMemref(Operation *op) const {
+    return dyn_cast<TypedValue<MemRefType>>(
+        cast<GatherScatterOp>(op).getBase());
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<GatherScatterOp>(op).getOffsets();
+  }
+
+  // We assume that the index offset could point anywhere within a dimension,
+  // but that it won't meaningfully alias outside of it.
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    VectorType vecTy = cast<GatherScatterOp>(op).getVectorType();
+    return SmallVector<int64_t>(vecTy.getRank(), ShapedType::kDynamic);
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value 
newMemref,
+                         ValueRange newIndices) const {
+    rewriter.modifyOpInPlace(op, [&]() {
+      auto concreteOp = cast<GatherScatterOp>(op);
+      concreteOp.getBaseMutable().assign(newMemref);
+      concreteOp.getOffsetsMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return false; }
+};
+} // namespace
+
+void mlir::vector::registerIndexedAccessOpInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, vector::VectorDialect *dialect) {
+#define LOADSTORELIKE(T)                                                       
\
+  T::attachInterface<VectorLoadStoreLikeOpInterface<T>>(*ctx)
+    LOADSTORELIKE(vector::LoadOp);
+    LOADSTORELIKE(vector::StoreOp);
+    LOADSTORELIKE(vector::MaskedLoadOp);
+    LOADSTORELIKE(vector::MaskedStoreOp);
+    LOADSTORELIKE(vector::ExpandLoadOp);
+    LOADSTORELIKE(vector::CompressStoreOp);
+#undef LOADSTORELIKE
+#define GATHERSCATTERLIKE(T)                                                   
\
+  T::attachInterface<GatherScatterLikeOpInterface<T>>(*ctx)
+    GATHERSCATTERLIKE(vector::GatherOp);
+    GATHERSCATTERLIKE(vector::ScatterOp);
+#undef GATHERSCATTERLIKE
+  });
+}
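
To make the dropped-unit-dimension path in `updateMemrefAndIndices` above
concrete, here is a sketch of the IR it produces for a `vector<1x4xf32>` load
once a caller hands back a rank-reduced memref and index (`%collapsed` and
`%idx` are assumed to come from that caller):

```mlir
// Rank-reduced load on the new memref/indices, then a shape_cast back to
// the original vector type, which is returned as the replacement value.
%v0 = vector.load %collapsed[%idx] : memref<64xf32>, vector<4xf32>
%v  = vector.shape_cast %v0 : vector<4xf32> to vector<1x4xf32>
```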
diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp
index 258fed135a3e5..7171f8eff49b8 100644
--- a/mlir/lib/RegisterAllDialects.cpp
+++ b/mlir/lib/RegisterAllDialects.cpp
@@ -39,6 +39,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/GPU/Transforms/BufferDeallocationOpInterfaceImpl.h"
+#include "mlir/Dialect/GPU/Transforms/IndexedAccessOpInterfaceImpl.h"
 #include "mlir/Dialect/IRDL/IR/IRDL.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -60,6 +61,7 @@
 #include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h"
 #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -94,6 +96,7 @@
 #include "mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
+#include "mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h"
 #include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h"
 #include "mlir/Dialect/WasmSSA/IR/WasmSSA.h"
 #include "mlir/Dialect/X86Vector/X86VectorDialect.h"
@@ -170,6 +173,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
   cf::registerBufferizableOpInterfaceExternalModels(registry);
   cf::registerBufferDeallocationOpInterfaceExternalModels(registry);
   gpu::registerBufferDeallocationOpInterfaceExternalModels(registry);
+  gpu::registerIndexedAccessOpInterfaceExternalModels(registry);
   gpu::registerValueBoundsOpInterfaceExternalModels(registry);
   LLVM::registerInlinerInterface(registry);
   NVVM::registerInlinerInterface(registry);
@@ -181,6 +185,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
   memref::registerValueBoundsOpInterfaceExternalModels(registry);
   memref::registerMemorySlotExternalModels(registry);
   ml_program::registerBufferizableOpInterfaceExternalModels(registry);
+  nvgpu::registerMemoryAccessOpInterfacesExternalModels(registry);
   scf::registerBufferDeallocationOpInterfaceExternalModels(registry);
   scf::registerBufferizableOpInterfaceExternalModels(registry);
   scf::registerValueBoundsOpInterfaceExternalModels(registry);
@@ -195,6 +200,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
   tensor::registerValueBoundsOpInterfaceExternalModels(registry);
   tosa::registerShardingInterfaceExternalModels(registry);
   vector::registerBufferizableOpInterfaceExternalModels(registry);
+  vector::registerIndexedAccessOpInterfaceExternalModels(registry);
   vector::registerSubsetOpInterfaceExternalModels(registry);
   vector::registerValueBoundsOpInterfaceExternalModels(registry);
   NVVM::registerNVVMTargetInterfaceExternalModels(registry);
