https://github.com/erichkeane updated https://github.com/llvm/llvm-project/pull/143751
>From 644612d088f28a21f7f59496de00f8c14de89c1d Mon Sep 17 00:00:00 2001 From: erichkeane <eke...@nvidia.com> Date: Wed, 11 Jun 2025 09:54:46 -0700 Subject: [PATCH 1/2] [OpenACC][CIR] Add parallelism determ. to all acc.loops PR #143720 adds a requirement to the ACC dialect that every acc.loop must have a seq, independent, or auto attribute for the 'default' device_type. The standard has rules for how this can be intuited: orphan/parallel/parallel loop: independent kernels/kernels loop: auto serial/serial loop: seq, unless there is a gang/worker/vector, at which point it should be 'auto'. This patch implements all of this rule as a 'cleanup' step on the IR generation for combined/loop operations. Note that the test impact is much less since I inadvertently have my 'operation' terminating curley matching the end curley from 'attribute' instead of the front of the line, so I've added sufficient tests to ensure I captured the above. --- clang/lib/CIR/CodeGen/CIRGenFunction.h | 12 +++ clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 2 + .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp | 60 +++++++++++ clang/test/CIR/CodeGenOpenACC/combined.cpp | 69 ++++++++++-- clang/test/CIR/CodeGenOpenACC/loop.cpp | 101 ++++++++++++++++-- 5 files changed, 227 insertions(+), 17 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b08dd540e6289..682d59d63faa8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -34,6 +34,12 @@ namespace { class ScalarExprEmitter; } // namespace +namespace mlir { +namespace acc { +class LoopOp; +} // namespace acc +} // namespace mlir + namespace clang::CIRGen { class CIRGenFunction : public CIRGenTypeCache { @@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache { OpenACCDirectiveKind dirKind, SourceLocation dirLoc, ArrayRef<const OpenACCClause *> clauses); + // The OpenACC LoopOp requires that we have auto, seq, or independent on all + // LoopOp operations for the 'none' device type case. This function checks if + // the LoopOp has one, else it updates it to have one. + void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan, + OpenACCDirectiveKind dk); + public: mlir::LogicalResult emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s); diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 2aab9cecf93d8..1feefa55eb270 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); + updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); + builder.create<TermOp>(end); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index 24cd1d399de65..2082ef65193ba 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -22,6 +22,63 @@ using namespace clang::CIRGen; using namespace cir; using namespace mlir::acc; +void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op, + bool isOrphan, + OpenACCDirectiveKind dk) { + // Check that at least one of auto, independent, or seq is present + // for the device-independent default clauses. + auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool { + return attr.getValue() == mlir::acc::DeviceType::None; + }; + bool hasDefaultSeq = + op.getSeqAttr() + ? llvm::any_of( + op.getSeqAttr().getAsRange<mlir::acc::DeviceTypeAttr>(), + hasDeviceNone) + : false; + bool hasDefaultIndependent = + op.getIndependentAttr() + ? llvm::any_of( + op.getIndependentAttr().getAsRange<mlir::acc::DeviceTypeAttr>(), + hasDeviceNone) + : false; + bool hasDefaultAuto = + op.getAuto_Attr() + ? llvm::any_of( + op.getAuto_Attr().getAsRange<mlir::acc::DeviceTypeAttr>(), + hasDeviceNone) + : false; + + if (hasDefaultSeq || hasDefaultIndependent || hasDefaultAuto) + return; + + // Orphan or parallel results in 'independent'. + if (isOrphan || dk == OpenACCDirectiveKind::Parallel || + dk == OpenACCDirectiveKind::ParallelLoop) { + op.addIndependent(builder.getContext(), {}); + return; + } + + // Kernels always results in 'auto'. + if (dk == OpenACCDirectiveKind::Kernels || + dk == OpenACCDirectiveKind::KernelsLoop) { + op.addAuto(builder.getContext(), {}); + return; + } + + // Serial should use 'seq' unless there is a gang, worker, or vector clause, + // in which case, it should use 'auto'. + assert(dk == OpenACCDirectiveKind::Serial || + dk == OpenACCDirectiveKind::SerialLoop); + + if (op.getWorkerAttr() || op.getVectorAttr() || op.getGangAttr()) { + op.addAuto(builder.getContext(), {}); + return; + } + + op.addSeq(builder.getContext(), {}); +} + mlir::LogicalResult CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); @@ -90,6 +147,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses()); + updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(), + s.getParentComputeConstructKind()); + mlir::LogicalResult stmtRes = mlir::success(); // Emit body. { diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index 1f3c9f1a8d3fa..5b83a9cb91898 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop seq device_type(nvidia, radeon) @@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop auto device_type(nvidia, radeon) @@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop independent device_type(nvidia, radeon) @@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.kernels combined(loop) { // CHECK: acc.loop combined(kernels) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]} + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]} // CHECK: acc.terminator // CHECK-NEXT: } loc #pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) @@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) { // CHECK-NEXT: } loc // CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"} // CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"} + + // Checking the automatic-addition of parallelism clauses. +#pragma acc parallel loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.parallel combined(loop) { + // CHECK-NEXT: acc.loop combined(parallel) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop worker + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop vector + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) vector { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop gang + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp index db94e2819b301..c0bf11e353951 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp @@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc #pragma acc loop device_type(radeon) seq for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc #pragma acc loop seq device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc #pragma acc loop device_type(radeon) independent for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc #pragma acc loop independent device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc #pragma acc loop device_type(radeon) auto for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc #pragma acc loop auto device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -116,7 +116,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]} #pragma acc loop collapse(1) device_type(radeon) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -124,7 +124,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -132,14 +132,14 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) for(unsigned I = 0; I < N; ++I) for(unsigned J = 0; J < N; ++J) for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]} #pragma acc loop tile(1, 2, 3) for(unsigned I = 0; I < N; ++I) @@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { // CHECK: acc.yield // CHECK-NEXT: } loc } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + + // Checking the automatic-addition of parallelism clauses. +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc + +#pragma acc parallel + { + // CHECK-NEXT: acc.parallel { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels + { + // CHECK-NEXT: acc.kernels { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop vector { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop gang + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } >From 39c443bda09cc7e7bf459256284208a2d0811333 Mon Sep 17 00:00:00 2001 From: erichkeane <eke...@nvidia.com> Date: Wed, 11 Jun 2025 10:59:50 -0700 Subject: [PATCH 2/2] Correct /simplify The gang/worker/vector check was insufficient based on review, so this fixes it. It also moves the check for the ParallelismFlag and gang/worker/vector check to LoopOp so that it can be used elsewhere. We also can simplify the Clang version here for the same reasons. --- .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp | 59 +++++-------------- .../mlir/Dialect/OpenACC/OpenACCOps.td | 8 +++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 24 ++++++++ 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index 2082ef65193ba..71f3ccb8e040e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -27,56 +27,29 @@ void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op, OpenACCDirectiveKind dk) { // Check that at least one of auto, independent, or seq is present // for the device-independent default clauses. - auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool { - return attr.getValue() == mlir::acc::DeviceType::None; - }; - bool hasDefaultSeq = - op.getSeqAttr() - ? llvm::any_of( - op.getSeqAttr().getAsRange<mlir::acc::DeviceTypeAttr>(), - hasDeviceNone) - : false; - bool hasDefaultIndependent = - op.getIndependentAttr() - ? llvm::any_of( - op.getIndependentAttr().getAsRange<mlir::acc::DeviceTypeAttr>(), - hasDeviceNone) - : false; - bool hasDefaultAuto = - op.getAuto_Attr() - ? llvm::any_of( - op.getAuto_Attr().getAsRange<mlir::acc::DeviceTypeAttr>(), - hasDeviceNone) - : false; - - if (hasDefaultSeq || hasDefaultIndependent || hasDefaultAuto) + if (op.hasParallelismFlag(mlir::acc::DeviceType::None)) return; - // Orphan or parallel results in 'independent'. - if (isOrphan || dk == OpenACCDirectiveKind::Parallel || - dk == OpenACCDirectiveKind::ParallelLoop) { + switch (dk) { + default: + llvm_unreachable("Invalid parent directive kind"); + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: op.addIndependent(builder.getContext(), {}); return; - } - - // Kernels always results in 'auto'. - if (dk == OpenACCDirectiveKind::Kernels || - dk == OpenACCDirectiveKind::KernelsLoop) { + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::KernelsLoop: op.addAuto(builder.getContext(), {}); return; - } - - // Serial should use 'seq' unless there is a gang, worker, or vector clause, - // in which case, it should use 'auto'. - assert(dk == OpenACCDirectiveKind::Serial || - dk == OpenACCDirectiveKind::SerialLoop); - - if (op.getWorkerAttr() || op.getVectorAttr() || op.getGangAttr()) { - op.addAuto(builder.getContext(), {}); + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::SerialLoop: + if (op.hasDefaultGangWorkerVector()) + op.addAuto(builder.getContext(), {}); + else + op.addSeq(builder.getContext(), {}); return; - } - - op.addSeq(builder.getContext(), {}); + }; } mlir::LogicalResult diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 083a18d80704e..34312655115a1 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", // device_types. This is for the case where there is no expression specified // in a 'gang'. void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>); + + // Return whether this LoopOp has an auto, seq, or independent for the + // specified device-type. + bool hasParallelismFlag(DeviceType); + + // Return whether this LoopOp has a gang, worker, or vector applying to the + // 'default'/None device-type. + bool hasDefaultGangWorkerVector(); }]; let hasCustomAssemblyFormat = 1; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index c72ec47be9f04..21e6b9d85f1a1 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2839,6 +2839,30 @@ void acc::LoopOp::addEmptyGang( effectiveDeviceTypes)); } +bool acc::LoopOp::hasParallelismFlag(DeviceType dt) { + auto hasDevice = [=](DeviceTypeAttr attr) -> bool { + return attr.getValue() == dt; + }; + auto testFromArr = [=](ArrayAttr arr) -> bool { + return llvm::any_of(arr.getAsRange<DeviceTypeAttr>(), hasDevice); + }; + + if (ArrayAttr arr = getSeqAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getIndependentAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getAuto_Attr(); arr && testFromArr(arr)) + return true; + + return false; +} + +bool acc::LoopOp::hasDefaultGangWorkerVector() { + return hasVector() || getVectorValue() || hasWorker() || getWorkerValue() || + hasGang() || getGangValue(GangArgType::Num) || + getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static); +} + void acc::LoopOp::addGangOperands( MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes, llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits