[llvm-branch-commits] [llvm] 82a6d15 - Revert "Revert "[symbolizer] Empty string is not an error" (#94424)"
Author: Serge Pavlov Date: 2024-07-05T09:10:03+07:00 New Revision: 82a6d1572ff4de1491ea58eb167967350eace9fa URL: https://github.com/llvm/llvm-project/commit/82a6d1572ff4de1491ea58eb167967350eace9fa DIFF: https://github.com/llvm/llvm-project/commit/82a6d1572ff4de1491ea58eb167967350eace9fa.diff LOG: Revert "Revert "[symbolizer] Empty string is not an error" (#94424)" This reverts commit c0bb16eaf7a6c16edadfd05ba4168fa536c227e2. Added: Modified: llvm/test/tools/llvm-symbolizer/get-input-file.test llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp Removed: diff --git a/llvm/test/tools/llvm-symbolizer/get-input-file.test b/llvm/test/tools/llvm-symbolizer/get-input-file.test index 8c21816591c81..50eb051968718 100644 --- a/llvm/test/tools/llvm-symbolizer/get-input-file.test +++ b/llvm/test/tools/llvm-symbolizer/get-input-file.test @@ -1,9 +1,9 @@ # If binary input file is not specified, llvm-symbolizer assumes it is the first # item in the command. -# No input items at all, complain about missing input file. +# No input items at all. Report an unknown line, but do not produce any output on stderr. RUN: echo | llvm-symbolizer 2>%t.1.err | FileCheck %s --check-prefix=NOSOURCE -RUN: FileCheck --input-file=%t.1.err --check-prefix=NOFILE %s +RUN: FileCheck --input-file=%t.1.err --implicit-check-not={{.}} --allow-empty %s # Only one input item, complain about missing addresses. RUN: llvm-symbolizer "foo" 2>%t.2.err | FileCheck %s --check-prefix=NOSOURCE @@ -32,8 +32,6 @@ RUN: FileCheck --input-file=%t.7.err --check-prefix=BAD-QUOTE %s NOSOURCE: ?? 
NOSOURCE-NEXT: ??:0:0 -NOFILE: error: no input filename has been specified - NOADDR: error: 'foo': no module offset has been specified NOTFOUND: error: 'foo': [[MSG]] diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp index b98bdbc388faf..6d7953f3109a5 100644 --- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -337,6 +337,14 @@ static void symbolizeInput(const opt::InputArgList , object::BuildID BuildID(IncomingBuildID.begin(), IncomingBuildID.end()); uint64_t Offset = 0; StringRef Symbol; + + // An empty input string may be used to check if the process is alive and + // responding to input. Do not emit a message on stderr in this case but + // respond on stdout. + if (InputString.empty()) { +printUnknownLineInfo(ModuleName, Printer); +return; + } if (Error E = parseCommand(Args.getLastArgValue(OPT_obj_EQ), IsAddr2Line, StringRef(InputString), Cmd, ModuleName, BuildID, Symbol, Offset)) { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass (PR #97605)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/97605 >From 84e7eb3c36b99ac7f673d9a7ad0c88c469f7f45d Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 1 Jul 2024 20:13:54 +0300 Subject: [PATCH 1/2] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass Make SLS Hardening pass handle BLRA* instructions the same way it handles BLR. The thunk names have the form __llvm_slsblr_thunk_xNfor BLR thunks __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks Now there are about 1800 possible thunk names, so do not rely on linear thunk function's name lookup and parse the name instead. --- .../Target/AArch64/AArch64SLSHardening.cpp| 326 -- .../speculation-hardening-sls-blra.mir| 210 +++ 2 files changed, 432 insertions(+), 104 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 00ba31b3e500d..4ae2e48af337f 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -13,6 +13,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -23,6 +24,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Target/TargetMachine.h" #include @@ -32,17 +34,103 @@ using namespace llvm; #define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" -static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; +// Common name prefix of all thunks generated by this pass. 
+// +// The generic form is +// __llvm_slsblr_thunk_xNfor BLR thunks +// __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks +// __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks +static constexpr StringRef CommonNamePrefix = "__llvm_slsblr_thunk_"; namespace { -// Set of inserted thunks: bitmask with bits corresponding to -// indexes in SLSBLRThunks array. -typedef uint32_t ThunksSet; +struct ThunkKind { + enum ThunkKindId { +ThunkBR, +ThunkBRAA, +ThunkBRAB, +ThunkBRAAZ, +ThunkBRABZ, + }; + + ThunkKindId Id; + StringRef NameInfix; + bool HasXmOperand; + bool NeedsPAuth; + + // Opcode to perform indirect jump from inside the thunk. + unsigned BROpcode; + + static const ThunkKind BR; + static const ThunkKind BRAA; + static const ThunkKind BRAB; + static const ThunkKind BRAAZ; + static const ThunkKind BRABZ; +}; + +// Set of inserted thunks. +class ThunksSet { +public: + static constexpr unsigned NumXRegisters = 32; + + // Given Xn register, returns n. + static unsigned indexOfXReg(Register Xn); + // Given n, returns Xn register. + static Register xRegByIndex(unsigned N); + + ThunksSet |=(const ThunksSet ) { +BLRThunks |= Other.BLRThunks; +BLRAAZThunks |= Other.BLRAAZThunks; +BLRABZThunks |= Other.BLRABZThunks; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRAAThunks[I] |= Other.BLRAAThunks[I]; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRABThunks[I] |= Other.BLRABThunks[I]; + +return *this; + } + + bool get(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +return getBitmask(Kind, Xm) & XnBit; + } + + void set(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +getBitmask(Kind, Xm) |= XnBit; + } + +private: + // Bitmasks representing operands used, with n-th bit corresponding to Xn + // register operand. If the instruction has a second operand (Xm), an array + // of bitmasks is used, indexed by m. 
+ // Indexes corresponding to the forbidden x16, x17 and x30 registers are + // always unset, for simplicity there are no holes. + uint32_t BLRThunks = 0; + uint32_t BLRAAZThunks = 0; + uint32_t BLRABZThunks = 0; + uint32_t BLRAAThunks[NumXRegisters] = {}; + uint32_t BLRABThunks[NumXRegisters] = {}; + + uint32_t (ThunkKind::ThunkKindId Kind, Register Xm) { +switch (Kind) { +case ThunkKind::ThunkBR: + return BLRThunks; +case ThunkKind::ThunkBRAAZ: + return BLRAAZThunks; +case ThunkKind::ThunkBRABZ: + return BLRABZThunks; +case ThunkKind::ThunkBRAA: + return BLRAAThunks[indexOfXReg(Xm)]; +case ThunkKind::ThunkBRAB: + return BLRABThunks[indexOfXReg(Xm)]; +} + } +}; struct SLSHardeningInserter : ThunkInserter { public: - const char *getThunkPrefix() { return SLSBLRNamePrefix; } + const char *getThunkPrefix() { return CommonNamePrefix.data(); } bool mayUseThunk(const
[llvm-branch-commits] [llvm] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass (PR #97605)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/97605 >From 84e7eb3c36b99ac7f673d9a7ad0c88c469f7f45d Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 1 Jul 2024 20:13:54 +0300 Subject: [PATCH] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass Make SLS Hardening pass handle BLRA* instructions the same way it handles BLR. The thunk names have the form __llvm_slsblr_thunk_xNfor BLR thunks __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks Now there are about 1800 possible thunk names, so do not rely on linear thunk function's name lookup and parse the name instead. --- .../Target/AArch64/AArch64SLSHardening.cpp| 326 -- .../speculation-hardening-sls-blra.mir| 210 +++ 2 files changed, 432 insertions(+), 104 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 00ba31b3e500d..4ae2e48af337f 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -13,6 +13,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -23,6 +24,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Target/TargetMachine.h" #include @@ -32,17 +34,103 @@ using namespace llvm; #define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" -static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; +// Common name prefix of all thunks generated by this pass. 
+// +// The generic form is +// __llvm_slsblr_thunk_xNfor BLR thunks +// __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks +// __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks +static constexpr StringRef CommonNamePrefix = "__llvm_slsblr_thunk_"; namespace { -// Set of inserted thunks: bitmask with bits corresponding to -// indexes in SLSBLRThunks array. -typedef uint32_t ThunksSet; +struct ThunkKind { + enum ThunkKindId { +ThunkBR, +ThunkBRAA, +ThunkBRAB, +ThunkBRAAZ, +ThunkBRABZ, + }; + + ThunkKindId Id; + StringRef NameInfix; + bool HasXmOperand; + bool NeedsPAuth; + + // Opcode to perform indirect jump from inside the thunk. + unsigned BROpcode; + + static const ThunkKind BR; + static const ThunkKind BRAA; + static const ThunkKind BRAB; + static const ThunkKind BRAAZ; + static const ThunkKind BRABZ; +}; + +// Set of inserted thunks. +class ThunksSet { +public: + static constexpr unsigned NumXRegisters = 32; + + // Given Xn register, returns n. + static unsigned indexOfXReg(Register Xn); + // Given n, returns Xn register. + static Register xRegByIndex(unsigned N); + + ThunksSet |=(const ThunksSet ) { +BLRThunks |= Other.BLRThunks; +BLRAAZThunks |= Other.BLRAAZThunks; +BLRABZThunks |= Other.BLRABZThunks; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRAAThunks[I] |= Other.BLRAAThunks[I]; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRABThunks[I] |= Other.BLRABThunks[I]; + +return *this; + } + + bool get(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +return getBitmask(Kind, Xm) & XnBit; + } + + void set(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +getBitmask(Kind, Xm) |= XnBit; + } + +private: + // Bitmasks representing operands used, with n-th bit corresponding to Xn + // register operand. If the instruction has a second operand (Xm), an array + // of bitmasks is used, indexed by m. 
+ // Indexes corresponding to the forbidden x16, x17 and x30 registers are + // always unset, for simplicity there are no holes. + uint32_t BLRThunks = 0; + uint32_t BLRAAZThunks = 0; + uint32_t BLRABZThunks = 0; + uint32_t BLRAAThunks[NumXRegisters] = {}; + uint32_t BLRABThunks[NumXRegisters] = {}; + + uint32_t (ThunkKind::ThunkKindId Kind, Register Xm) { +switch (Kind) { +case ThunkKind::ThunkBR: + return BLRThunks; +case ThunkKind::ThunkBRAAZ: + return BLRAAZThunks; +case ThunkKind::ThunkBRABZ: + return BLRABZThunks; +case ThunkKind::ThunkBRAA: + return BLRAAThunks[indexOfXReg(Xm)]; +case ThunkKind::ThunkBRAB: + return BLRABThunks[indexOfXReg(Xm)]; +} + } +}; struct SLSHardeningInserter : ThunkInserter { public: - const char *getThunkPrefix() { return SLSBLRNamePrefix; } + const char *getThunkPrefix() { return CommonNamePrefix.data(); } bool mayUseThunk(const MachineFunction )
[llvm-branch-commits] [llvm] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass (PR #97605)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/97605 >From f49c32c8465e9e68d7345fa82ae1294cc2faf0e7 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 1 Jul 2024 20:13:54 +0300 Subject: [PATCH] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass Make SLS Hardening pass handle BLRA* instructions the same way it handles BLR. The thunk names have the form __llvm_slsblr_thunk_xNfor BLR thunks __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks Now there are about 1800 possible thunk names, so do not rely on linear thunk function's name lookup and parse the name instead. --- .../Target/AArch64/AArch64SLSHardening.cpp| 326 -- .../speculation-hardening-sls-blra.mir| 210 +++ 2 files changed, 432 insertions(+), 104 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 24f023a3d70e7..4ae2e48af337f 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -13,6 +13,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -23,6 +24,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Target/TargetMachine.h" #include @@ -32,17 +34,103 @@ using namespace llvm; #define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" -static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; +// Common name prefix of all thunks generated by this pass. 
+// +// The generic form is +// __llvm_slsblr_thunk_xNfor BLR thunks +// __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks +// __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks +static constexpr StringRef CommonNamePrefix = "__llvm_slsblr_thunk_"; namespace { -// Set of inserted thunks: bitmask with bits corresponding to -// indexes in SLSBLRThunks array. -typedef uint32_t ThunksSet; +struct ThunkKind { + enum ThunkKindId { +ThunkBR, +ThunkBRAA, +ThunkBRAB, +ThunkBRAAZ, +ThunkBRABZ, + }; + + ThunkKindId Id; + StringRef NameInfix; + bool HasXmOperand; + bool NeedsPAuth; + + // Opcode to perform indirect jump from inside the thunk. + unsigned BROpcode; + + static const ThunkKind BR; + static const ThunkKind BRAA; + static const ThunkKind BRAB; + static const ThunkKind BRAAZ; + static const ThunkKind BRABZ; +}; + +// Set of inserted thunks. +class ThunksSet { +public: + static constexpr unsigned NumXRegisters = 32; + + // Given Xn register, returns n. + static unsigned indexOfXReg(Register Xn); + // Given n, returns Xn register. + static Register xRegByIndex(unsigned N); + + ThunksSet |=(const ThunksSet ) { +BLRThunks |= Other.BLRThunks; +BLRAAZThunks |= Other.BLRAAZThunks; +BLRABZThunks |= Other.BLRABZThunks; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRAAThunks[I] |= Other.BLRAAThunks[I]; +for (unsigned I = 0; I < NumXRegisters; ++I) + BLRABThunks[I] |= Other.BLRABThunks[I]; + +return *this; + } + + bool get(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +return getBitmask(Kind, Xm) & XnBit; + } + + void set(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { +uint32_t XnBit = 1u << indexOfXReg(Xn); +getBitmask(Kind, Xm) |= XnBit; + } + +private: + // Bitmasks representing operands used, with n-th bit corresponding to Xn + // register operand. If the instruction has a second operand (Xm), an array + // of bitmasks is used, indexed by m. 
+ // Indexes corresponding to the forbidden x16, x17 and x30 registers are + // always unset, for simplicity there are no holes. + uint32_t BLRThunks = 0; + uint32_t BLRAAZThunks = 0; + uint32_t BLRABZThunks = 0; + uint32_t BLRAAThunks[NumXRegisters] = {}; + uint32_t BLRABThunks[NumXRegisters] = {}; + + uint32_t (ThunkKind::ThunkKindId Kind, Register Xm) { +switch (Kind) { +case ThunkKind::ThunkBR: + return BLRThunks; +case ThunkKind::ThunkBRAAZ: + return BLRAAZThunks; +case ThunkKind::ThunkBRABZ: + return BLRABZThunks; +case ThunkKind::ThunkBRAA: + return BLRAAThunks[indexOfXReg(Xm)]; +case ThunkKind::ThunkBRAB: + return BLRABThunks[indexOfXReg(Xm)]; +} + } +}; struct SLSHardeningInserter : ThunkInserter { public: - const char *getThunkPrefix() { return SLSBLRNamePrefix; } + const char *getThunkPrefix() { return CommonNamePrefix.data(); } bool mayUseThunk(const MachineFunction )
[llvm-branch-commits] [flang] [mlir] [Flang][OpenMP] Add lowering support for DO SIMD (PR #97718)
llvmbot wrote: @llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-mlir-openmp Author: Sergio Afonso (skatrak) Changes This patch adds support for lowering 'DO SIMD' constructs to MLIR. SIMD information is now stored in an `omp.simd` loop wrapper, which is currently ignored by the OpenMP dialect to LLVM IR translation stage. The end result is that runtime behavior of compiled 'DO SIMD' constructs does not change after this patch, so 'DO SIMD' still runs like 'DO' (i.e. SIMD width = 1). However, all of the required information is now present in the resulting MLIR representation. To avoid confusion, the previous wsloop-simd.f90 lit test is renamed to wsloop-schedule.f90 and a new wsloop-simd.f90 test is created to check the addition of SIMD clauses to the `omp.simd` operation produced when a 'DO SIMD' construct is lowered to MLIR. --- Full diff: https://github.com/llvm/llvm-project/pull/97718.diff 10 Files Affected: - (modified) flang/lib/Lower/OpenMP/OpenMP.cpp (+36-13) - (removed) flang/test/Lower/OpenMP/Todo/omp-do-simd-aligned.f90 (-16) - (modified) flang/test/Lower/OpenMP/Todo/omp-do-simd-linear.f90 (+1-1) - (removed) flang/test/Lower/OpenMP/Todo/omp-do-simd-safelen.f90 (-14) - (removed) flang/test/Lower/OpenMP/Todo/omp-do-simd-simdlen.f90 (-14) - (modified) flang/test/Lower/OpenMP/if-clause.f90 (+31) - (modified) flang/test/Lower/OpenMP/loop-compound.f90 (+3) - (added) flang/test/Lower/OpenMP/wsloop-schedule.f90 (+37) - (modified) flang/test/Lower/OpenMP/wsloop-simd.f90 (+42-32) - (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+3) ``diff diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 1830e31349cfb..9c23888a87173 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2060,19 +2060,42 @@ static void genCompositeDoSimd(lower::AbstractConverter , const ConstructQueue , ConstructQueue::iterator item, DataSharingProcessor ) { - ClauseProcessor 
cp(converter, semaCtx, item->clauses); - cp.processTODO(loc, - llvm::omp::OMPD_do_simd); - // TODO: Add support for vectorization - add vectorization hints inside loop - // body. - // OpenMP standard does not specify the length of vector instructions. - // Currently we safely assume that for !$omp do simd pragma the SIMD length - // is equal to 1 (i.e. we generate standard workshare loop). - // When support for vectorization is enabled, then we need to add handling of - // if clause. Currently if clause can be skipped because we always assume - // SIMD length = 1. - genStandaloneDo(converter, symTable, semaCtx, eval, loc, queue, item, dsp); + lower::StatementContext stmtCtx; + + // Clause processing. + mlir::omp::WsloopClauseOps wsloopClauseOps; + llvm::SmallVector wsloopReductionSyms; + llvm::SmallVector wsloopReductionTypes; + genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + wsloopClauseOps, wsloopReductionTypes, wsloopReductionSyms); + + mlir::omp::SimdClauseOps simdClauseOps; + genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); + + mlir::omp::LoopNestClauseOps loopNestClauseOps; + llvm::SmallVector iv; + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopNestClauseOps, iv); + + // Operation creation. + auto wsloopOp = + genWsloopWrapperOp(converter, semaCtx, eval, loc, wsloopClauseOps, + wsloopReductionSyms, wsloopReductionTypes); + + auto simdOp = genSimdWrapperOp(converter, semaCtx, eval, loc, simdClauseOps); + + // Construct wrapper entry block list and associated symbols. It is important + // that the symbol and block argument order match, so that the symbol-value + // bindings created are correct. + // TODO: Add omp.wsloop private and omp.simd private and reduction args. 
+ auto wrapperArgs = llvm::to_vector(llvm::concat( + wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments())); + + assert(wsloopReductionSyms.size() == wrapperArgs.size() && + "Number of symbols and wrapper block arguments must match"); + genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, +loopNestClauseOps, iv, wsloopReductionSyms, wrapperArgs, +llvm::omp::Directive::OMPD_do_simd, dsp); } static void genCompositeTaskloopSimd( diff --git a/flang/test/Lower/OpenMP/Todo/omp-do-simd-aligned.f90 b/flang/test/Lower/OpenMP/Todo/omp-do-simd-aligned.f90 deleted file mode 100644 index b62c54182442a..0 --- a/flang/test/Lower/OpenMP/Todo/omp-do-simd-aligned.f90 +++ /dev/null @@ -1,16 +0,0 @@ -! This test checks lowering of OpenMP do simd aligned() pragma - -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd
[llvm-branch-commits] [flang] [mlir] [Flang][OpenMP] Add lowering support for DO SIMD (PR #97718)
https://github.com/skatrak created https://github.com/llvm/llvm-project/pull/97718 This patch adds support for lowering 'DO SIMD' constructs to MLIR. SIMD information is now stored in an `omp.simd` loop wrapper, which is currently ignored by the OpenMP dialect to LLVM IR translation stage. The end result is that runtime behavior of compiled 'DO SIMD' constructs does not change after this patch, so 'DO SIMD' still runs like 'DO' (i.e. SIMD width = 1). However, all of the required information is now present in the resulting MLIR representation. To avoid confusion, the previous wsloop-simd.f90 lit test is renamed to wsloop-schedule.f90 and a new wsloop-simd.f90 test is created to check the addition of SIMD clauses to the `omp.simd` operation produced when a 'DO SIMD' construct is lowered to MLIR. >From cca42d31ed43dee5521605f50f3fed380d034f70 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 4 Jul 2024 12:56:43 +0100 Subject: [PATCH] [Flang][OpenMP] Add lowering support for DO SIMD This patch adds support for lowering 'DO SIMD' constructs to MLIR. SIMD information is now stored in an `omp.simd` loop wrapper, which is currently ignored by the OpenMP dialect to LLVM IR translation stage. The end result is that runtime behavior of compiled 'DO SIMD' constructs does not change after this patch, so 'DO SIMD' still runs like 'DO' (i.e. SIMD width = 1). However, all of the required information is now present in the resulting MLIR representation. To avoid confusion, the previous wsloop-simd.f90 lit test is renamed to wsloop-schedule.f90 and a new wsloop-simd.f90 test is created to check the addition of SIMD clauses to the `omp.simd` operation produced when a 'DO SIMD' construct is lowered to MLIR. 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 49 .../Lower/OpenMP/Todo/omp-do-simd-aligned.f90 | 16 .../Lower/OpenMP/Todo/omp-do-simd-linear.f90 | 2 +- .../Lower/OpenMP/Todo/omp-do-simd-safelen.f90 | 14 .../Lower/OpenMP/Todo/omp-do-simd-simdlen.f90 | 14 flang/test/Lower/OpenMP/if-clause.f90 | 31 flang/test/Lower/OpenMP/loop-compound.f90 | 3 + flang/test/Lower/OpenMP/wsloop-schedule.f90 | 37 ++ flang/test/Lower/OpenMP/wsloop-simd.f90 | 74 +++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 3 + 10 files changed, 153 insertions(+), 90 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/omp-do-simd-aligned.f90 delete mode 100644 flang/test/Lower/OpenMP/Todo/omp-do-simd-safelen.f90 delete mode 100644 flang/test/Lower/OpenMP/Todo/omp-do-simd-simdlen.f90 create mode 100644 flang/test/Lower/OpenMP/wsloop-schedule.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 1830e31349cfb..9c23888a87173 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2060,19 +2060,42 @@ static void genCompositeDoSimd(lower::AbstractConverter , const ConstructQueue , ConstructQueue::iterator item, DataSharingProcessor ) { - ClauseProcessor cp(converter, semaCtx, item->clauses); - cp.processTODO(loc, - llvm::omp::OMPD_do_simd); - // TODO: Add support for vectorization - add vectorization hints inside loop - // body. - // OpenMP standard does not specify the length of vector instructions. - // Currently we safely assume that for !$omp do simd pragma the SIMD length - // is equal to 1 (i.e. we generate standard workshare loop). - // When support for vectorization is enabled, then we need to add handling of - // if clause. Currently if clause can be skipped because we always assume - // SIMD length = 1. - genStandaloneDo(converter, symTable, semaCtx, eval, loc, queue, item, dsp); + lower::StatementContext stmtCtx; + + // Clause processing. 
+ mlir::omp::WsloopClauseOps wsloopClauseOps; + llvm::SmallVector wsloopReductionSyms; + llvm::SmallVector wsloopReductionTypes; + genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + wsloopClauseOps, wsloopReductionTypes, wsloopReductionSyms); + + mlir::omp::SimdClauseOps simdClauseOps; + genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); + + mlir::omp::LoopNestClauseOps loopNestClauseOps; + llvm::SmallVector iv; + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopNestClauseOps, iv); + + // Operation creation. + auto wsloopOp = + genWsloopWrapperOp(converter, semaCtx, eval, loc, wsloopClauseOps, + wsloopReductionSyms, wsloopReductionTypes); + + auto simdOp = genSimdWrapperOp(converter, semaCtx, eval, loc, simdClauseOps); + + // Construct wrapper entry block list and associated symbols. It is important + // that the symbol and block argument order match, so that the symbol-value + // bindings created are correct. +
[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT:s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT:s_load_dwordx2 s[2:3], s[0:1], 0x24 cdevadas wrote: Opened https://github.com/llvm/llvm-project/issues/97715. https://github.com/llvm/llvm-project/pull/96162 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] NFC: Share DataSharingProcessor creation logic for all loop directives (PR #97565)
https://github.com/mjklemm approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/97565 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] NFC: Remove unused argument for omp.target lowering (PR #97564)
https://github.com/mjklemm approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/97564 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] Prevent allocas from being inserted into loop wrappers (PR #97563)
https://github.com/tblah approved this pull request. Code changes look good. I would prefer to see a lit test for this code path. It is good enough if this is used by a test added later in your PR stack. Otherwise, please could you add a lit test which hits this condition so that we can catch if this gets broken by any changes in the future. https://github.com/llvm/llvm-project/pull/97563 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT:s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT:s_load_dwordx2 s[2:3], s[0:1], 0x24 arsenm wrote: Can you open an issue for this https://github.com/llvm/llvm-project/pull/96162 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] Refactor loop-related lowering for composite support (PR #97566)
@@ -1492,7 +1466,7 @@ genParallelOp(lower::AbstractConverter , lower::SymMap , firOpBuilder.createBlock(, /*insertPt=*/{}, allRegionArgTypes, allRegionArgLocs); -llvm::SmallVector allSymbols = reductionSyms; +llvm::SmallVector allSymbols(reductionSyms); tblah wrote: ultra-nit, feel free to ignore Personally I don't like using `()` here because it can parse (to humans and computers) like a function declaration. ```suggestion llvm::SmallVector allSymbols{reductionSyms}; ``` https://github.com/llvm/llvm-project/pull/97566 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] Refactor loop-related lowering for composite support (PR #97566)
https://github.com/tblah commented: This looks good overall. Do you expect there to be a lot more wrapper operations in the near future? If so, they look like there is a common pattern that could be further abstracted. Something like ```c++ template static OP genWrapperOp(..., llvm::ArrayRef extraBlockArgTypes) { fir::FirOpBuilder = ...; auto op = firOpBuilder.create(loc, clauseOps); llvm::SmallVector blockArgLocs(extraBlockArgTypes.size(), loc); firOpBuilder.createBlock(...) firOpBuilder.setInsertionPoint(...) return op; } ``` https://github.com/llvm/llvm-project/pull/97566 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] Refactor loop-related lowering for composite support (PR #97566)
https://github.com/tblah edited https://github.com/llvm/llvm-project/pull/97566 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] Refactor loop-related lowering for composite support (PR #97566)
@@ -518,8 +518,8 @@ struct OpWithBodyGenInfo { } OpWithBodyGenInfo & - setReductions(llvm::SmallVectorImpl *value1, -llvm::SmallVectorImpl *value2) { + setReductions(llvm::ArrayRef *value1, +llvm::ArrayRef *value2) { tblah wrote: Can we pass these by value instead of as pointers now? https://github.com/llvm/llvm-project/pull/97566 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Support select optimization (PR #80124)
wangpc-pp wrote: https://github.com/llvm/llvm-project/pull/97708 is split out for adding `FeaturePredictableSelectIsExpensive`. https://github.com/llvm/llvm-project/pull/80124 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] NFC: Share DataSharingProcessor creation logic for all loop directives (PR #97565)
https://github.com/tblah approved this pull request. LGTM, thanks! https://github.com/llvm/llvm-project/pull/97565 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96444 >From 4590c05051bbf57f0b269b2561aa1a7c74f06fbc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 23 Jun 2024 17:07:53 +0200 Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 21 ++--- llvm/lib/Target/AMDGPU/BUFInstructions.td | 10 ++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 6 +++--- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 10 +++--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index bea233bfb27bd..94e8e77b3c052 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst "Has flat_atomic_add_f32 instruction" >; +def FeatureFlatBufferGlobalAtomicFaddF64Inst + : SubtargetFeature<"flat-buffer-global-fadd-f64-inst", + "HasFlatBufferGlobalAtomicFaddF64Inst", + "true", + "Has flat, buffer, and global instructions for f64 atomic fadd" +>; + def FeatureMemoryAtomicFAddF32DenormalSupport : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support", "HasMemoryAtomicFaddF32DenormalSupport", @@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet< FeatureBackOffBarrier, FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureAtomicFMinFMaxF64FlatInsts + FeatureAtomicFMinFMaxF64FlatInsts, + FeatureFlatBufferGlobalAtomicFaddF64Inst ])>; def FeatureISAVersion9_0_C : FeatureSet< @@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, - FeatureMemoryAtomicFAddF32DenormalSupport + FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureFlatBufferGlobalAtomicFaddF64Inst ]>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -1932,11 +1941,9 @@ def isGFX12Plus : def HasFlatAddressSpace 
: Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; - -def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd - Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">, - // FIXME: This is too coarse, and working around using pseudo's predicates on real instruction. - AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>; +def HasFlatBufferGlobalAtomicFaddF64Inst : + Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">, + AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>; def HasAtomicFMinFMaxF32GlobalInsts : Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">, diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 3b8d94b744000..a904c8483dbf5 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in { } } // End SubtargetPredicate = isGFX90APlus -let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { +let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; +} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2 // depending on some subtargets. 
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; -} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 +} def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> { let SubtargetPredicate = isGFX940Plus; @@ -1836,9 +1838,9 @@ let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts -let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { +let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; -} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 +} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 4bf8f20269a15..16dc019ede810 100644 ---
[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96443 >From b29740ba988e2286a7edc67b5a96c5dce0e600a6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 23 Jun 2024 16:44:08 +0200 Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd denormal support Not sure what the behavior for gfx90a is. The SPG says it always flushes. The instruction documentation says it does not. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 14 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 +++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3f35db8883716..51c077598df74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst "Has flat_atomic_add_f32 instruction" >; +def FeatureMemoryAtomicFaddF32DenormalSupport + : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support", + "HasAtomicMemoryAtomicFaddF32DenormalSupport", + "true", + "global/flat/buffer atomic fadd for float supports denormal handling" +>; + def FeatureAgentScopeFineGrainedRemoteMemoryAtomics : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics", "HasAgentScopeFineGrainedRemoteMemoryAtomics", @@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureAgentScopeFineGrainedRemoteMemoryAtomics + FeatureAgentScopeFineGrainedRemoteMemoryAtomics, + FeatureMemoryAtomicFaddF32DenormalSupport ]>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, - Feature1_5xVGPRs]>; + Feature1_5xVGPRs, + FeatureMemoryAtomicFaddF32DenormalSupport]>; + ]>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, diff --git 
a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 9e2a316a9ed28..db0b2b67a0388 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; bool HasAtomicFaddNoRtnInsts = false; + bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false; bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; bool HasAtomicBufferGlobalPkAddF16Insts = false; bool HasAtomicCSubNoRtnInsts = false; @@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } + /// \return true if the target's flat, global, and buffer atomic fadd for + /// float supports denormal handling. + bool hasMemoryAtomicFaddF32DenormalSupport() const { +return HasAtomicMemoryAtomicFaddF32DenormalSupport; + } + /// \return true if atomic operations targeting fine-grained memory work /// correctly at device scope, in allocations in host or peer PCIe device /// memory. >From 84819ef58eb12bd3606fbf250b516793dbf36add Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 24 Jun 2024 12:10:37 +0200 Subject: [PATCH 2/3] Add to gfx11. RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm" thought I'm not sure I trust it. 
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 51c077598df74..370992eb81ff3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeaturePackedTID, - FeatureVcmpxPermlaneHazard]>; + FeatureVcmpxPermlaneHazard, + FeatureMemoryAtomicFaddF32DenormalSupport]>; // There are few workarounds that need to be // added to all targets. This pessimizes codegen @@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, - FeatureMemoryAtomicFaddF32DenormalSupport]>; + FeatureMemoryAtomicFaddF32DenormalSupport ]>; def FeatureISAVersion12_Generic: FeatureSet< >From 5ef29a5699514b2c3956d1f44b8a0393b7c87004 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 26 Jun 2024 11:30:51 +0200 Subject: [PATCH 3/3] Rename --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 370992eb81ff3..bea233bfb27bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@
[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96443 >From b633b82d58ecc55bb88eaefd87d5b3030799f2c0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 23 Jun 2024 16:44:08 +0200 Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd denormal support Not sure what the behavior for gfx90a is. The SPG says it always flushes. The instruction documentation says it does not. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 14 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 +++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3f35db8883716d..51c077598df749 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst "Has flat_atomic_add_f32 instruction" >; +def FeatureMemoryAtomicFaddF32DenormalSupport + : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support", + "HasAtomicMemoryAtomicFaddF32DenormalSupport", + "true", + "global/flat/buffer atomic fadd for float supports denormal handling" +>; + def FeatureAgentScopeFineGrainedRemoteMemoryAtomics : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics", "HasAgentScopeFineGrainedRemoteMemoryAtomics", @@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureAgentScopeFineGrainedRemoteMemoryAtomics + FeatureAgentScopeFineGrainedRemoteMemoryAtomics, + FeatureMemoryAtomicFaddF32DenormalSupport ]>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, - Feature1_5xVGPRs]>; + Feature1_5xVGPRs, + FeatureMemoryAtomicFaddF32DenormalSupport]>; + ]>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, diff --git 
a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 9e2a316a9ed28d..db0b2b67a03884 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; bool HasAtomicFaddNoRtnInsts = false; + bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false; bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; bool HasAtomicBufferGlobalPkAddF16Insts = false; bool HasAtomicCSubNoRtnInsts = false; @@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } + /// \return true if the target's flat, global, and buffer atomic fadd for + /// float supports denormal handling. + bool hasMemoryAtomicFaddF32DenormalSupport() const { +return HasAtomicMemoryAtomicFaddF32DenormalSupport; + } + /// \return true if atomic operations targeting fine-grained memory work /// correctly at device scope, in allocations in host or peer PCIe device /// memory. >From 55ee6de3f2ac99d981ed57c3f933cc18e9a738a2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 24 Jun 2024 12:10:37 +0200 Subject: [PATCH 2/3] Add to gfx11. RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm" thought I'm not sure I trust it. 
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 51c077598df749..370992eb81ff33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeaturePackedTID, - FeatureVcmpxPermlaneHazard]>; + FeatureVcmpxPermlaneHazard, + FeatureMemoryAtomicFaddF32DenormalSupport]>; // There are few workarounds that need to be // added to all targets. This pessimizes codegen @@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, - FeatureMemoryAtomicFaddF32DenormalSupport]>; + FeatureMemoryAtomicFaddF32DenormalSupport ]>; def FeatureISAVersion12_Generic: FeatureSet< >From 573e7bcdedc1fd56575f580d0fafd9e1ad9dbc96 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 26 Jun 2024 11:30:51 +0200 Subject: [PATCH 3/3] Rename --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 370992eb81ff33..bea233bfb27bd6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++
[llvm-branch-commits] [llvm] [RISCV] Support select optimization (PR #80124)
wangpc-pp wrote: Ping. I'd like to push this forward because we don't take branch probabilities into consideration now. Example: https://godbolt.org/z/doGhYadKM We should use branches instead of selects in this case and this patch (the enabling of SelectOpt) will optimize this. `clang -O3 -march=rv64gc_zba_zbb_zbc_zbs_zicond -Xclang -target-feature -Xclang +enable-select-opt -Xclang -target-feature -Xclang +predictable-select-expensive` ``` New Select group with %. = select i1 %cmp, i32 5, i32 13, !prof !9 Analyzing select group containing %. = select i1 %cmp, i32 5, i32 13, !prof !9 Converted to branch because of highly predictable branch. ``` ```asm func0: # @func0 li a2, 5 mul a1, a0, a0 bge a2, a0, .LBB0_2 addwa0, a1, a2 ret .LBB0_2:# %select.false li a2, 13 addwa0, a1, a2 ret ``` https://github.com/llvm/llvm-project/pull/80124 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Support select optimization (PR #80124)
https://github.com/wangpc-pp edited https://github.com/llvm/llvm-project/pull/80124 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Support select optimization (PR #80124)
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/80124 >From e3fb1fe7bdd4b7c24f9361c4d14dd1206fc8c067 Mon Sep 17 00:00:00 2001 From: wangpc Date: Sun, 18 Feb 2024 11:12:16 +0800 Subject: [PATCH 1/2] Move after addIRPasses Created using spr 1.3.4 --- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index fdf1c023fff87..7a26e1956424c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -450,15 +450,15 @@ void RISCVPassConfig::addIRPasses() { if (EnableLoopDataPrefetch) addPass(createLoopDataPrefetchPass()); -if (EnableSelectOpt && getOptLevel() == CodeGenOptLevel::Aggressive) - addPass(createSelectOptimizePass()); - addPass(createRISCVGatherScatterLoweringPass()); addPass(createInterleavedAccessPass()); addPass(createRISCVCodeGenPreparePass()); } TargetPassConfig::addIRPasses(); + + if (getOptLevel() == CodeGenOptLevel::Aggressive && EnableSelectOpt) +addPass(createSelectOptimizePass()); } bool RISCVPassConfig::addPreISel() { >From 5d5398596dc30c47c67572ec20137fb3f9434940 Mon Sep 17 00:00:00 2001 From: wangpc Date: Wed, 21 Feb 2024 21:21:28 +0800 Subject: [PATCH 2/2] Fix test Created using spr 1.3.4 --- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 62c1af52e6c20..8b52e3fe7b2f1 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -34,15 +34,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Data Prefetch -; CHECK-NEXT: Post-Dominator Tree Construction -; CHECK-NEXT: Branch Probability Analysis -; CHECK-NEXT: Block Frequency Analysis -; CHECK-NEXT: Lazy Branch Probability 
Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Optimize selects -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: RISC-V gather/scatter lowering ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: RISC-V CodeGenPrepare @@ -77,6 +68,15 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: TLS Variable Hoist +; CHECK-NEXT: Post-Dominator Tree Construction +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Optimize selects +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Support select optimization (PR #80124)
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/80124 >From e3fb1fe7bdd4b7c24f9361c4d14dd1206fc8c067 Mon Sep 17 00:00:00 2001 From: wangpc Date: Sun, 18 Feb 2024 11:12:16 +0800 Subject: [PATCH 1/2] Move after addIRPasses Created using spr 1.3.4 --- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index fdf1c023fff87..7a26e1956424c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -450,15 +450,15 @@ void RISCVPassConfig::addIRPasses() { if (EnableLoopDataPrefetch) addPass(createLoopDataPrefetchPass()); -if (EnableSelectOpt && getOptLevel() == CodeGenOptLevel::Aggressive) - addPass(createSelectOptimizePass()); - addPass(createRISCVGatherScatterLoweringPass()); addPass(createInterleavedAccessPass()); addPass(createRISCVCodeGenPreparePass()); } TargetPassConfig::addIRPasses(); + + if (getOptLevel() == CodeGenOptLevel::Aggressive && EnableSelectOpt) +addPass(createSelectOptimizePass()); } bool RISCVPassConfig::addPreISel() { >From 5d5398596dc30c47c67572ec20137fb3f9434940 Mon Sep 17 00:00:00 2001 From: wangpc Date: Wed, 21 Feb 2024 21:21:28 +0800 Subject: [PATCH 2/2] Fix test Created using spr 1.3.4 --- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 62c1af52e6c20..8b52e3fe7b2f1 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -34,15 +34,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Data Prefetch -; CHECK-NEXT: Post-Dominator Tree Construction -; CHECK-NEXT: Branch Probability Analysis -; CHECK-NEXT: Block Frequency Analysis -; CHECK-NEXT: Lazy Branch Probability 
Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Optimize selects -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: RISC-V gather/scatter lowering ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: RISC-V CodeGenPrepare @@ -77,6 +68,15 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: TLS Variable Hoist +; CHECK-NEXT: Post-Dominator Tree Construction +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Optimize selects +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits