[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-30 Thread Valentin Clement バレンタイン クレメン via llvm-branch-commits

https://github.com/clementval closed 
https://github.com/llvm/llvm-project/pull/90526
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread Slava Zakharin via llvm-branch-commits

https://github.com/vzakhari approved this pull request.


https://github.com/llvm/llvm-project/pull/90526
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread Valentin Clement バレンタイン クレメン via llvm-branch-commits

clementval wrote:

> Thank you, Valentin!
> 
> Is it expected that we can have a mix of `fir.alloca` and `fir.cuda_alloc` 
> operations in the device routines (e.g. I suppose 
> `fir::FirOpBuilder::createTemporaryAlloc` can generate `fir.alloca` for a 
> temporary location in device code)? It is not necessarily an issue, I just 
> want to understand whether we will have to handle both operations in the 
> device code.

createTemporaryAlloc will also need to be modified to issue 
cuda_alloc/cuda_free. I'm still evaluating the extent of the change. fir.alloca 
ops are fine in device code as long as they are not device, managed or unified, 
as we can support them with the address space. Note that creating managed or 
unified variables in a device subprogram is not recommended. 

https://github.com/llvm/llvm-project/pull/90526
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread Slava Zakharin via llvm-branch-commits

vzakhari wrote:

Thank you, Valentin!

Is it expected that we can have a mix of `fir.alloca` and `fir.cuda_alloc` 
operations in the device routines (e.g. I suppose 
`fir::FirOpBuilder::createTemporaryAlloc` can generate `fir.alloca` for a 
temporary location in device code)?  It is not necessarily an issue, I just 
want to understand whether we will have to handle both operations in the device 
code.

https://github.com/llvm/llvm-project/pull/90526
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-semantics

Author: Valentin Clement (バレンタイン クレメン) (clementval)


Changes

Lower the allocation of local CUDA device, managed and unified variables to 
fir.cuda_alloc. Add fir.cuda_free in the function context finalization. 

---
Full diff: https://github.com/llvm/llvm-project/pull/90526.diff


6 Files Affected:

- (modified) flang/include/flang/Optimizer/Builder/FIRBuilder.h (+7) 
- (modified) flang/include/flang/Semantics/tools.h (+17) 
- (modified) flang/lib/Lower/ConvertVariable.cpp (+29) 
- (modified) flang/lib/Optimizer/Builder/FIRBuilder.cpp (+14-11) 
- (modified) flang/lib/Optimizer/Dialect/FIROps.cpp (+15) 
- (modified) flang/test/Lower/CUDA/cuda-data-attribute.cuf (+25) 


``diff
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h 
b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index e4c954159f71be..0d650f830b64e0 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -708,6 +708,13 @@ mlir::Value createNullBoxProc(fir::FirOpBuilder , 
mlir::Location loc,
 
 /// Set internal linkage attribute on a function.
 void setInternalLinkage(mlir::func::FuncOp);
+
+llvm::SmallVector
+elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape);
+
+llvm::SmallVector
+elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Semantics/tools.h 
b/flang/include/flang/Semantics/tools.h
index da10969ebc7021..c9eb5bc857ac01 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -222,6 +222,23 @@ inline bool HasCUDAAttr(const Symbol ) {
   return false;
 }
 
+inline bool NeedCUDAAlloc(const Symbol &sym) {
+  bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())};
+  if (const auto *details{
+  sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
+if (details->cudaDataAttr() &&
+(*details->cudaDataAttr() == common::CUDADataAttr::Device ||
+*details->cudaDataAttr() == common::CUDADataAttr::Managed ||
+*details->cudaDataAttr() == common::CUDADataAttr::Unified)) {
+  // Descriptor is allocated on host when in host context.
+  if (Fortran::semantics::IsAllocatable(sym))
+return inDeviceSubprogram;
+  return true;
+}
+  }
+  return false;
+}
+
 const Scope *FindCUDADeviceContext(const Scope *);
 std::optional GetCUDADataAttr(const Symbol *);
 
diff --git a/flang/lib/Lower/ConvertVariable.cpp 
b/flang/lib/Lower/ConvertVariable.cpp
index 21db0cac11bf6a..9a17acf5b15c36 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -693,6 +693,22 @@ static mlir::Value 
createNewLocal(Fortran::lower::AbstractConverter ,
   if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee))
 return builder.create(loc, fir::ReferenceType::get(ty));
 
+  if (Fortran::semantics::NeedCUDAAlloc(ultimateSymbol)) {
+fir::CUDADataAttributeAttr cudaAttr =
+Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ ultimateSymbol);
+llvm::SmallVector indices;
+llvm::SmallVector elidedShape =
+fir::factory::elideExtentsAlreadyInType(ty, shape);
+llvm::SmallVector elidedLenParams =
+fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+auto idxTy = builder.getIndexType();
+for (mlir::Value sh : elidedShape)
+  indices.push_back(builder.createConvert(loc, idxTy, sh));
+return builder.create(loc, ty, nm, symNm, cudaAttr,
+lenParams, indices);
+  }
+
   // Let the builder do all the heavy lifting.
   if (!Fortran::semantics::IsProcedurePointer(ultimateSymbol))
 return builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg);
@@ -927,6 +943,19 @@ static void 
instantiateLocal(Fortran::lower::AbstractConverter ,
   });
 }
   }
+  if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
+auto *builder = ();
+mlir::Location loc = converter.getCurrentLocation();
+fir::ExtendedValue exv =
+converter.getSymbolExtendedValue(var.getSymbol(), );
+auto *sym = ();
+converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() {
+  fir::CUDADataAttributeAttr cudaAttr =
+  Fortran::lower::translateSymbolCUDADataAttribute(
+  builder->getContext(), *sym);
+  builder->create(loc, fir::getBase(exv), cudaAttr);
+});
+  }
 }
 
 //======//
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp 
b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a0fbae5b614cc7..a813b646087d73 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -176,8 +176,9 @@ mlir::Value 

[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)


Changes

Lower the allocation of local CUDA device, managed and unified variables to 
fir.cuda_alloc. Add fir.cuda_free in the function context finalization. 

---
Full diff: https://github.com/llvm/llvm-project/pull/90526.diff


6 Files Affected:

- (modified) flang/include/flang/Optimizer/Builder/FIRBuilder.h (+7) 
- (modified) flang/include/flang/Semantics/tools.h (+17) 
- (modified) flang/lib/Lower/ConvertVariable.cpp (+29) 
- (modified) flang/lib/Optimizer/Builder/FIRBuilder.cpp (+14-11) 
- (modified) flang/lib/Optimizer/Dialect/FIROps.cpp (+15) 
- (modified) flang/test/Lower/CUDA/cuda-data-attribute.cuf (+25) 


``diff
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h 
b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index e4c954159f71be..0d650f830b64e0 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -708,6 +708,13 @@ mlir::Value createNullBoxProc(fir::FirOpBuilder , 
mlir::Location loc,
 
 /// Set internal linkage attribute on a function.
 void setInternalLinkage(mlir::func::FuncOp);
+
+llvm::SmallVector
+elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape);
+
+llvm::SmallVector
+elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Semantics/tools.h 
b/flang/include/flang/Semantics/tools.h
index da10969ebc7021..c9eb5bc857ac01 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -222,6 +222,23 @@ inline bool HasCUDAAttr(const Symbol ) {
   return false;
 }
 
+inline bool NeedCUDAAlloc(const Symbol &sym) {
+  bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())};
+  if (const auto *details{
+  sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
+if (details->cudaDataAttr() &&
+(*details->cudaDataAttr() == common::CUDADataAttr::Device ||
+*details->cudaDataAttr() == common::CUDADataAttr::Managed ||
+*details->cudaDataAttr() == common::CUDADataAttr::Unified)) {
+  // Descriptor is allocated on host when in host context.
+  if (Fortran::semantics::IsAllocatable(sym))
+return inDeviceSubprogram;
+  return true;
+}
+  }
+  return false;
+}
+
 const Scope *FindCUDADeviceContext(const Scope *);
 std::optional GetCUDADataAttr(const Symbol *);
 
diff --git a/flang/lib/Lower/ConvertVariable.cpp 
b/flang/lib/Lower/ConvertVariable.cpp
index 21db0cac11bf6a..9a17acf5b15c36 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -693,6 +693,22 @@ static mlir::Value 
createNewLocal(Fortran::lower::AbstractConverter ,
   if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee))
 return builder.create(loc, fir::ReferenceType::get(ty));
 
+  if (Fortran::semantics::NeedCUDAAlloc(ultimateSymbol)) {
+fir::CUDADataAttributeAttr cudaAttr =
+Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ ultimateSymbol);
+llvm::SmallVector indices;
+llvm::SmallVector elidedShape =
+fir::factory::elideExtentsAlreadyInType(ty, shape);
+llvm::SmallVector elidedLenParams =
+fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+auto idxTy = builder.getIndexType();
+for (mlir::Value sh : elidedShape)
+  indices.push_back(builder.createConvert(loc, idxTy, sh));
+return builder.create(loc, ty, nm, symNm, cudaAttr,
+lenParams, indices);
+  }
+
   // Let the builder do all the heavy lifting.
   if (!Fortran::semantics::IsProcedurePointer(ultimateSymbol))
 return builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg);
@@ -927,6 +943,19 @@ static void 
instantiateLocal(Fortran::lower::AbstractConverter ,
   });
 }
   }
+  if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
+auto *builder = ();
+mlir::Location loc = converter.getCurrentLocation();
+fir::ExtendedValue exv =
+converter.getSymbolExtendedValue(var.getSymbol(), );
+auto *sym = ();
+converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() {
+  fir::CUDADataAttributeAttr cudaAttr =
+  Fortran::lower::translateSymbolCUDADataAttribute(
+  builder->getContext(), *sym);
+  builder->create(loc, fir::getBase(exv), cudaAttr);
+});
+  }
 }
 
 //======//
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp 
b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a0fbae5b614cc7..a813b646087d73 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -176,8 +176,9 @@ mlir::Value 

[llvm-branch-commits] [flang] [flang][cuda] Lower device/managed/unified allocation to cuda ops (PR #90526)

2024-04-29 Thread Valentin Clement バレンタイン クレメン via llvm-branch-commits

https://github.com/clementval created 
https://github.com/llvm/llvm-project/pull/90526

Lower the allocation of local CUDA device, managed and unified variables to 
fir.cuda_alloc. Add fir.cuda_free in the function context finalization. 

>From 02d1ef45cae1ba973a51e5898f092403395c Mon Sep 17 00:00:00 2001
From: Valentin Clement 
Date: Mon, 29 Apr 2024 14:30:46 -0700
Subject: [PATCH] [flang][cuda] Lower device/managed/unified allocation to cuda
 ops

---
 .../flang/Optimizer/Builder/FIRBuilder.h  |  7 +
 flang/include/flang/Semantics/tools.h | 17 +++
 flang/lib/Lower/ConvertVariable.cpp   | 29 +++
 flang/lib/Optimizer/Builder/FIRBuilder.cpp| 25 +---
 flang/lib/Optimizer/Dialect/FIROps.cpp| 15 ++
 flang/test/Lower/CUDA/cuda-data-attribute.cuf | 25 
 6 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h 
b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index e4c954159f71be..0d650f830b64e0 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -708,6 +708,13 @@ mlir::Value createNullBoxProc(fir::FirOpBuilder , 
mlir::Location loc,
 
 /// Set internal linkage attribute on a function.
 void setInternalLinkage(mlir::func::FuncOp);
+
+llvm::SmallVector
+elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape);
+
+llvm::SmallVector
+elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Semantics/tools.h 
b/flang/include/flang/Semantics/tools.h
index da10969ebc7021..c9eb5bc857ac01 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -222,6 +222,23 @@ inline bool HasCUDAAttr(const Symbol ) {
   return false;
 }
 
+inline bool NeedCUDAAlloc(const Symbol &sym) {
+  bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())};
+  if (const auto *details{
+  sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
+if (details->cudaDataAttr() &&
+(*details->cudaDataAttr() == common::CUDADataAttr::Device ||
+*details->cudaDataAttr() == common::CUDADataAttr::Managed ||
+*details->cudaDataAttr() == common::CUDADataAttr::Unified)) {
+  // Descriptor is allocated on host when in host context.
+  if (Fortran::semantics::IsAllocatable(sym))
+return inDeviceSubprogram;
+  return true;
+}
+  }
+  return false;
+}
+
 const Scope *FindCUDADeviceContext(const Scope *);
 std::optional GetCUDADataAttr(const Symbol *);
 
diff --git a/flang/lib/Lower/ConvertVariable.cpp 
b/flang/lib/Lower/ConvertVariable.cpp
index 21db0cac11bf6a..9a17acf5b15c36 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -693,6 +693,22 @@ static mlir::Value 
createNewLocal(Fortran::lower::AbstractConverter ,
   if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee))
 return builder.create(loc, fir::ReferenceType::get(ty));
 
+  if (Fortran::semantics::NeedCUDAAlloc(ultimateSymbol)) {
+fir::CUDADataAttributeAttr cudaAttr =
+Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ ultimateSymbol);
+llvm::SmallVector indices;
+llvm::SmallVector elidedShape =
+fir::factory::elideExtentsAlreadyInType(ty, shape);
+llvm::SmallVector elidedLenParams =
+fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+auto idxTy = builder.getIndexType();
+for (mlir::Value sh : elidedShape)
+  indices.push_back(builder.createConvert(loc, idxTy, sh));
+return builder.create(loc, ty, nm, symNm, cudaAttr,
+lenParams, indices);
+  }
+
   // Let the builder do all the heavy lifting.
   if (!Fortran::semantics::IsProcedurePointer(ultimateSymbol))
 return builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg);
@@ -927,6 +943,19 @@ static void 
instantiateLocal(Fortran::lower::AbstractConverter ,
   });
 }
   }
+  if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
+auto *builder = ();
+mlir::Location loc = converter.getCurrentLocation();
+fir::ExtendedValue exv =
+converter.getSymbolExtendedValue(var.getSymbol(), );
+auto *sym = ();
+converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() {
+  fir::CUDADataAttributeAttr cudaAttr =
+  Fortran::lower::translateSymbolCUDADataAttribute(
+  builder->getContext(), *sym);
+  builder->create(loc, fir::getBase(exv), cudaAttr);
+});
+  }
 }
 
 //======//
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp 
b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a0fbae5b614cc7..a813b646087d73 100644
---