[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-04-27 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From a1edbc8431cff5ac28dbf4a1b5cae5e9e9ab7dac Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH] [MLIR][OpenMP] Support allocations of device shared memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 231 +-
 .../omptarget-device-shared-memory.mlir   |  81 ++
 2 files changed, 249 insertions(+), 63 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 1e944ac93e6f1..896e0b62d9821 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1111,12 +1111,63 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. For some variables, the
+/// associated OpenMP construct or their uses might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
+ omp::TargetExecMode::generic;
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (parentOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1128,10 +1179,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.reserve(op.getNumReductionVars());
 
-  for (std::s

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-04-20 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From f35234dfd0b134557c1a3f47581044cadacc5a72 Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH] [MLIR][OpenMP] Support allocations of device shared memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 231 +-
 .../omptarget-device-shared-memory.mlir   |  81 ++
 2 files changed, 249 insertions(+), 63 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8e5453ce48297..2a0d3d8100d36 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1111,12 +1111,63 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. For some variables, the
+/// associated OpenMP construct or their uses might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
+ omp::TargetExecMode::generic;
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (parentOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1128,10 +1179,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.reserve(op.getNumReductionVars());
 
-  for (std::s

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-04-15 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From d7d85d9cbb67e48cc2ba4ee6e1023d76288e5a69 Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH 1/4] [MLIR][OpenMP] Support allocations of device shared
 memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 227 +-
 .../omptarget-device-shared-memory.mlir   |  86 +++
 2 files changed, 253 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 7765df19f5f40..6308b0445d0d0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1111,12 +1111,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the variable,
+/// its uses or the associated OpenMP construct might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1128,10 +1180,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.rese

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-02-23 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From bb9338a56d7f08ac5d2940c064fad8e2c4d0b6bb Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH 1/3] [MLIR][OpenMP] Support allocations of device shared
 memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 227 +-
 .../omptarget-device-shared-memory.mlir   |  86 +++
 2 files changed, 253 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 444cced73115f..47a5472bceeca 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1120,12 +1120,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the variable,
+/// its uses or the associated OpenMP construct might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1137,10 +1189,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.rese

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-01-26 Thread Tom Eccles via llvm-branch-commits

https://github.com/tblah approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2026-01-23 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From 245c2508f5a9272bda20aa4d320102bd73353a9b Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH 1/3] [MLIR][OpenMP] Support allocations of device shared
 memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 227 +-
 .../omptarget-device-shared-memory.mlir   |  86 +++
 2 files changed, 253 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index bdb711c49382d..318990177d0d9 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1120,12 +1120,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the variable,
+/// its uses or the associated OpenMP construct might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1137,10 +1189,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.rese

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-10-18 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak updated 
https://github.com/llvm/llvm-project/pull/150924

>From d839bbddbd70403653e45f69a221d391818e3e82 Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH 1/2] [MLIR][OpenMP] Support allocations of device shared
 memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs are not affected yet. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 227 +-
 .../omptarget-device-shared-memory.mlir   |  86 +++
 2 files changed, 253 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 172029196905d..360e7569bb261 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1103,12 +1103,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the variable,
+/// its uses or the associated OpenMP construct might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1120,10 +1172,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.rese

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-08-06 Thread Pranav Bhandarkar via llvm-branch-commits

https://github.com/bhandarkar-pranav edited 
https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-08-06 Thread Pranav Bhandarkar via llvm-branch-commits

https://github.com/bhandarkar-pranav commented:

Thanks for this PR, @skatrak. One nit and one clarifying question on my part.

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-08-06 Thread Pranav Bhandarkar via llvm-branch-commits


@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is an `omp::TargetOp` or nested inside of
+/// one and that target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the variable,
+/// its uses or the associated OpenMP construct might also need to be taken into

bhandarkar-pranav wrote:

The 'or' in this sentence seems to be off. Did you mean "Depending on the
variable and its uses, the associated OpenMP construct might need to be
taken..."?

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-08-06 Thread Pranav Bhandarkar via llvm-branch-commits


@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))

bhandarkar-pranav wrote:

There seems to be a hole in my understanding of this. At this point in the 
code, we know that `value` is a `BlockArgument`. We know that it has an 
ancestor, `targetOp`, that is an `omp::TargetOp`. Shouldn't all the users of a 
`BlockArgument` be such that they are dominated by the `BlockArgument`? 
Therefore `targetOp` should trivially be an ancestor of all users, no? All this 
is to say that the use of a `BlockArgument` inside an `omp::ParallelOp` should 
be enough and this ancestor check is superfluous. Unless, of course, I am 
missing something that is obvious.

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-30 Thread Michael Kruse via llvm-branch-commits


@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);

Meinersbur wrote:

```suggestion
  auto *targetOp = dyn_cast(parentOp);
```

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-30 Thread Michael Kruse via llvm-branch-commits


@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);

Meinersbur wrote:

```suggestion
  auto *targetOp = dyn_cast(op);
```

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-30 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-30 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur edited 
https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-30 Thread Michael Kruse via llvm-branch-commits


@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {

Meinersbur wrote:

```suggestion
if (auto *parallelOp = dyn_cast(user)) {
```

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-28 Thread Sergio Afonso via llvm-branch-commits

https://github.com/skatrak created 
https://github.com/llvm/llvm-project/pull/150924

This patch updates the allocation of some reduction and private variables 
within target regions to use device shared memory rather than private memory. 
This is a prerequisite to produce working Generic kernels containing parallel 
regions.

In particular, the following situations result in the usage of device shared 
memory (only when compiling for the target device if they are placed inside of 
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or 
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs, 
so private variables on these constructs won't currently be affected. When 
support is added, if it uses the existing `allocatePrivateVars` and 
`cleanupPrivateVars` functions, usage of device shared memory will be 
introduced automatically.

>From 0586e88f442e892ccdc1df03d86368773bc6e636 Mon Sep 17 00:00:00 2001
From: Sergio Afonso 
Date: Fri, 27 Jun 2025 13:29:13 +0100
Subject: [PATCH] [MLIR][OpenMP] Support allocations of device shared memory

This patch updates the allocation of some reduction and private variables
within target regions to use device shared memory rather than private memory.
This is a prerequisite to produce working Generic kernels containing parallel
regions.

In particular, the following situations result in the usage of device shared
memory (only when compiling for the target device if they are placed inside of
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs,
so private variables on these constructs won't currently be affected. When
support is added, if it uses the existing `allocatePrivateVars` and
`cleanupPrivateVars` functions, usage of device shared memory will be
introduced automatically.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 227 +-
 .../omptarget-device-shared-memory.mlir   |  86 +++
 2 files changed, 253 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 34358cdcece3c..c5a26cab553cf 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-28 Thread Sergio Afonso via llvm-branch-commits

skatrak wrote:

PR stack:
- #150922
- #150923
- #150924 ◀️
- #150925
- #150926
- #150927

https://github.com/llvm/llvm-project/pull/150924
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Support allocations of device shared memory (PR #150924)

2025-07-28 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mlir-llvm

@llvm/pr-subscribers-flang-openmp

Author: Sergio Afonso (skatrak)


Changes

This patch updates the allocation of some reduction and private variables 
within target regions to use device shared memory rather than private memory. 
This is a prerequisite to produce working Generic kernels containing parallel 
regions.

In particular, the following situations result in the usage of device shared 
memory (only when compiling for the target device if they are placed inside of 
a target region representing a Generic kernel):
- Reduction variables on `teams` constructs.
- Private variables on `teams` and `distribute` constructs that are reduced or 
used inside of a `parallel` region.

Currently, there is no support for delayed privatization on `teams` constructs, 
so private variables on these constructs won't currently be affected. When 
support is added, if it uses the existing `allocatePrivateVars` and 
`cleanupPrivateVars` functions, usage of device shared memory will be 
introduced automatically.

---

Patch is 23.49 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/150924.diff


2 Files Affected:

- (modified) 
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+167-60) 
- (added) mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir (+86) 


``diff
diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 34358cdcece3c..c5a26cab553cf 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1102,12 +1102,64 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might potentially have to
+/// be done in device shared memory. That means we're compiling for a 
offloading
+/// target, the operation is an `omp::TargetOp` or nested inside of one and 
that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. Depending on the 
variable,
+/// its uses or the associated OpenMP construct might also need to be taken 
into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+   const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+return false;
+
+  auto targetOp = dyn_cast(op);
+  if (!targetOp)
+targetOp = op.getParentOfType();
+
+  return targetOp &&
+ !bitEnumContainsAny(
+ targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()),
+ omp::TargetRegionFlags::spmd);
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast(parentOp);
+  if (!targetOp)
+targetOp = parentOp->getParentOfType();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+if (auto parallelOp = dyn_cast(user)) {
+  if (llvm::is_contained(parallelOp.getReductionVars(), value))
+return true;
+} else if (auto parallelOp = user->getParentOfType()) {
+  if (targetOp->isProperAncestor(parallelOp))
+return true;
+}
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
 /// `deferredStores` contains information to create store operations which 
needs
 /// to be inserted after all allocas
 template 
 static LogicalResult
-allocReductionVars(T loop, ArrayRef reductionArgs,
+allocReductionVars(T op, ArrayRef reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1119,10 +1171,14 @@ allocReductionVars(T loop, ArrayRef 
reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+  isa(op) && mightAllocInDeviceSharedMemory(*op, 
*ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.reserve(op.getNumReductionVars());
 
-  for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {