from:"Stanislav Mekhanoshin via cfe-commits"

r368917 - [AMDGPU] Do not assume a default GCN target

2019-08-14 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Wed Aug 14 13:55:15 2019
New Revision: 368917

URL: http://llvm.org/viewvc/llvm-project?rev=368917&view=rev
Log:
[AMDGPU] Do not assume a default GCN target

Differential Revision: https://reviews.llvm.org/D66246

Modified:
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/test/Driver/amdgpu-mcpu.cl

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=368917&r1=368916&r2=368917&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Wed Aug 14 13:55:15 2019
@@ -131,9 +131,6 @@ bool AMDGPUTargetInfo::initFeatureMap(
 
   // XXX - What does the member GPU mean if device name string passed here?
   if (isAMDGCN(getTriple())) {
-if (CPU.empty())
-  CPU = "gfx600";
-
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
 case GK_GFX1012:
 case GK_GFX1011:
@@ -189,7 +186,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
 case GK_GFX600:
   break;
 case GK_NONE:
-  return false;
+  break;
 default:
   llvm_unreachable("Unhandled GPU!");
 }

Modified: cfe/trunk/test/Driver/amdgpu-mcpu.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/amdgpu-mcpu.cl?rev=368917&r1=368916&r2=368917&view=diff
==
--- cfe/trunk/test/Driver/amdgpu-mcpu.cl (original)
+++ cfe/trunk/test/Driver/amdgpu-mcpu.cl Wed Aug 14 13:55:15 2019
@@ -52,6 +52,7 @@
 // AMDGCN-based processors.
 //
 
+// RUN: %clang -### -target amdgcn %s 2>&1 | FileCheck --check-prefix=GCNDEFAULT %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx600 %s 2>&1 | FileCheck 
--check-prefix=GFX600 %s
 // RUN: %clang -### -target amdgcn -mcpu=tahiti %s 2>&1 | FileCheck 
--check-prefix=TAHITI %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx601 %s 2>&1 | FileCheck 
--check-prefix=GFX601 %s
@@ -90,6 +91,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck 
--check-prefix=GFX1011 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck 
--check-prefix=GFX1012 %s
 
+// GCNDEFAULT-NOT: -target-cpu
 // GFX600:"-target-cpu" "gfx600"
 // TAHITI:"-target-cpu" "tahiti"
 // GFX601:"-target-cpu" "gfx601"


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r365528 - [AMDGPU] gfx908 clang target

2019-07-09 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Tue Jul  9 11:19:00 2019
New Revision: 365528

URL: http://llvm.org/viewvc/llvm-project?rev=365528&view=rev
Log:
[AMDGPU] gfx908 clang target

Differential Revision: https://reviews.llvm.org/D64430

Modified:
cfe/trunk/include/clang/Basic/Cuda.h
cfe/trunk/lib/Basic/Cuda.cpp
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/lib/Basic/Targets/NVPTX.cpp
cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/Driver/amdgpu-macros.cl
cfe/trunk/test/Driver/amdgpu-mcpu.cl
cfe/trunk/test/Driver/cuda-bad-arch.cu

Modified: cfe/trunk/include/clang/Basic/Cuda.h
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=365528&r1=365527&r2=365528&view=diff
==
--- cfe/trunk/include/clang/Basic/Cuda.h (original)
+++ cfe/trunk/include/clang/Basic/Cuda.h Tue Jul  9 11:19:00 2019
@@ -64,6 +64,7 @@ enum class CudaArch {
   GFX902,
   GFX904,
   GFX906,
+  GFX908,
   GFX909,
   LAST,
 };

Modified: cfe/trunk/lib/Basic/Cuda.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=365528&r1=365527&r2=365528&view=diff
==
--- cfe/trunk/lib/Basic/Cuda.cpp (original)
+++ cfe/trunk/lib/Basic/Cuda.cpp Tue Jul  9 11:19:00 2019
@@ -109,6 +109,8 @@ const char *CudaArchToString(CudaArch A)
 return "gfx904";
   case CudaArch::GFX906: // TBA
 return "gfx906";
+  case CudaArch::GFX908: // TBA
+return "gfx908";
   case CudaArch::GFX909: // TBA
 return "gfx909";
   }
@@ -147,6 +149,7 @@ CudaArch StringToCudaArch(llvm::StringRe
   .Case("gfx902", CudaArch::GFX902)
   .Case("gfx904", CudaArch::GFX904)
   .Case("gfx906", CudaArch::GFX906)
+  .Case("gfx908", CudaArch::GFX908)
   .Case("gfx909", CudaArch::GFX909)
   .Default(CudaArch::UNKNOWN);
 }
@@ -259,6 +262,7 @@ CudaVirtualArch VirtualArchForCudaArch(C
   case CudaArch::GFX902:
   case CudaArch::GFX904:
   case CudaArch::GFX906:
+  case CudaArch::GFX908:
   case CudaArch::GFX909:
 return CudaVirtualArch::COMPUTE_AMDGCN;
   }
@@ -306,6 +310,7 @@ CudaVersion MinVersionForCudaArch(CudaAr
   case CudaArch::GFX902:
   case CudaArch::GFX904:
   case CudaArch::GFX906:
+  case CudaArch::GFX908:
   case CudaArch::GFX909:
 return CudaVersion::CUDA_70;
   }

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=365528&r1=365527&r2=365528&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Tue Jul  9 11:19:00 2019
@@ -152,6 +152,12 @@ bool AMDGPUTargetInfo::initFeatureMap(
   Features["gfx10-insts"] = true;
   Features["s-memrealtime"] = true;
   break;
+case GK_GFX908:
+  Features["dot3-insts"] = true;
+  Features["dot4-insts"] = true;
+  Features["dot5-insts"] = true;
+  Features["dot6-insts"] = true;
+  LLVM_FALLTHROUGH;
 case GK_GFX906:
   Features["dl-insts"] = true;
   Features["dot1-insts"] = true;

Modified: cfe/trunk/lib/Basic/Targets/NVPTX.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/NVPTX.cpp?rev=365528&r1=365527&r2=365528&view=diff
==
--- cfe/trunk/lib/Basic/Targets/NVPTX.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/NVPTX.cpp Tue Jul  9 11:19:00 2019
@@ -191,6 +191,7 @@ void NVPTXTargetInfo::getTargetDefines(c
   case CudaArch::GFX902:
   case CudaArch::GFX904:
   case CudaArch::GFX906:
+  case CudaArch::GFX908:
   case CudaArch::GFX909:
   case CudaArch::LAST:
 break;

Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=365528&r1=365527&r2=365528&view=diff
==
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Tue Jul  9 11:19:00 2019
@@ -4928,6 +4928,7 @@ void CGOpenMPRuntimeNVPTX::checkArchForU
   case CudaArch::GFX902:
   case CudaArch::GFX904:
   case CudaArch::GFX906:
+  case CudaArch::GFX908:
   case CudaArch::GFX909:
   case CudaArch::UNKNOWN:
 break;
@@ -4982,6 +4983,7 @@ static std::pair get
   case CudaArch::GFX902:
   case CudaArch::GFX904:
   case CudaArch::GFX906:
+  case CudaArch::GFX908:
   case CudaArch::GFX909:
   case CudaArch::UNKNOWN:
 break;

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=365528&r1=365527&r2=365528&view=diff

[clang] ea7d0e2 - [AMDGPU] gfx1031 target

2020-08-05 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-08-05T12:36:26-07:00
New Revision: ea7d0e2996ec6b72a08dbef26dadf217458ab382

URL: 
https://github.com/llvm/llvm-project/commit/ea7d0e2996ec6b72a08dbef26dadf217458ab382
DIFF: 
https://github.com/llvm/llvm-project/commit/ea7d0e2996ec6b72a08dbef26dadf217458ab382.diff

LOG: [AMDGPU] gfx1031 target

Differential Revision: https://reviews.llvm.org/D85337

Added: 


Modified: 
clang/include/clang/Basic/Cuda.h
clang/lib/Basic/Targets/AMDGPU.cpp
clang/lib/Basic/Targets/NVPTX.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
clang/test/Driver/amdgpu-macros.cl
clang/test/Driver/amdgpu-mcpu.cl
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/BinaryFormat/ELF.h
llvm/include/llvm/Support/TargetParser.h
llvm/lib/ObjectYAML/ELFYAML.cpp
llvm/lib/Support/TargetParser.cpp
llvm/lib/Target/AMDGPU/GCNProcessors.td
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll
llvm/test/CodeGen/AMDGPU/idot8s.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
llvm/test/MC/AMDGPU/gfx1030_err.s
llvm/test/MC/AMDGPU/gfx1030_new.s
llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
llvm/tools/llvm-readobj/ELFDumper.cpp

Removed: 




diff  --git a/clang/include/clang/Basic/Cuda.h 
b/clang/include/clang/Basic/Cuda.h
index 1716325a99312..19301e825bcfd 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -75,6 +75,7 @@ enum class CudaArch {
   GFX1011,
   GFX1012,
   GFX1030,
+  GFX1031,
   LAST,
 };
 

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index e147045110a99..57351c7557082 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -174,6 +174,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
   // XXX - What does the member GPU mean if device name string passed here?
   if (isAMDGCN(getTriple())) {
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
+case GK_GFX1031:
 case GK_GFX1030:
   Features["ci-insts"] = true;
   Features["dot1-insts"] = true;

diff  --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index 18c3c83703310..ef61b8f78946c 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -201,6 +201,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case CudaArch::GFX1011:
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
+  case CudaArch::GFX1031:
   case CudaArch::LAST:
 break;
   case CudaArch::UNKNOWN:

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 4c26163237980..93357a48eb89a 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -14,6 +14,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1012 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1030 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1031 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1031 %s
 
 // GFX600-NOT: "target-features"
 // GFX601-NOT: "target-features"
@@ -26,5 +27,6 @@
 // GFX1011: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1030: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
+// GFX1031: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 
 kernel void test() {}

diff  --git a/clang/test/Driver/amdgpu-macros.cl 
b/clang/test/Driver/amdgpu-macros.cl
index 24d2fead28d5e..ae46e85f94794 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -181,6 +181,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX1011 %s

[clang] 105608a - [AMDGPU] Added missing gfx1031 cases to CGOpenMPRuntimeGPU.cpp

2020-08-05 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-08-05T12:39:03-07:00
New Revision: 105608a4c2821ca8f8340104614c1176ed1ed82d

URL: 
https://github.com/llvm/llvm-project/commit/105608a4c2821ca8f8340104614c1176ed1ed82d
DIFF: 
https://github.com/llvm/llvm-project/commit/105608a4c2821ca8f8340104614c1176ed1ed82d.diff

LOG: [AMDGPU] Added missing gfx1031 cases to CGOpenMPRuntimeGPU.cpp

Added: 


Modified: 
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Removed: 




diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 452eb15eb8d1..9440758a85b6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -5014,6 +5014,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   case CudaArch::GFX1011:
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
+  case CudaArch::GFX1031:
   case CudaArch::UNKNOWN:
 break;
   case CudaArch::LAST:
@@ -5074,6 +5075,7 @@ static std::pair 
getSMsBlocksPerSM(CodeGenModule &CGM) {
   case CudaArch::GFX1011:
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
+  case CudaArch::GFX1031:
   case CudaArch::UNKNOWN:
 break;
   case CudaArch::LAST:



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 58de24c - [AMDGPU] Sorted targets in amdgpu-features.cl. NFC.

2020-06-12 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-06-12T11:57:40-07:00
New Revision: 58de24ce6cb413afea1470ec183f3fc5d9ca6817

URL: 
https://github.com/llvm/llvm-project/commit/58de24ce6cb413afea1470ec183f3fc5d9ca6817
DIFF: 
https://github.com/llvm/llvm-project/commit/58de24ce6cb413afea1470ec183f3fc5d9ca6817.diff

LOG: [AMDGPU] Sorted targets in amdgpu-features.cl. NFC.

Added: 


Modified: 
clang/test/CodeGenOpenCL/amdgpu-features.cl

Removed: 




diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 7529a4d4abb1..344d4bca44c9 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -3,26 +3,26 @@
 // Check that appropriate features are defined for every supported AMDGPU
 // "-target" and "-mcpu" options.
 
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX600 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX601 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX700 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX801 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx904 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX904 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX906 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX908 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1012 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX801 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX700 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX600 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX601 %s
 
+// GFX600-NOT: "target-features"
+// GFX601-NOT: "target-features"
+// GFX700: "target-features"="+ci-insts,+flat-address-space"
+// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
 // GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX908: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime"
 // GFX1010: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1011: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX700: "target-features"="+ci-insts,+flat-address-space"
-// GFX600-NOT: "target-features"
-// GFX601-NOT: "target-features"
 
 kernel void test() {}



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r363341 - [AMDGPU] gfx1010 wave32 clang support

2019-06-13 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Thu Jun 13 16:47:59 2019
New Revision: 363341

URL: http://llvm.org/viewvc/llvm-project?rev=363341&view=rev
Log:
[AMDGPU] gfx1010 wave32 clang support

Differential Revision: https://reviews.llvm.org/D63209

Modified:
cfe/trunk/docs/ClangCommandLineReference.rst
cfe/trunk/include/clang/Driver/Options.td
cfe/trunk/lib/CodeGen/CGBuiltin.cpp
cfe/trunk/lib/Driver/ToolChains/AMDGPU.cpp
cfe/trunk/lib/Driver/ToolChains/HIP.cpp
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl
cfe/trunk/test/Driver/amdgpu-features.c

Modified: cfe/trunk/docs/ClangCommandLineReference.rst
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/docs/ClangCommandLineReference.rst?rev=363341&r1=363340&r2=363341&view=diff
==
--- cfe/trunk/docs/ClangCommandLineReference.rst (original)
+++ cfe/trunk/docs/ClangCommandLineReference.rst Thu Jun 13 16:47:59 2019
@@ -2401,6 +2401,10 @@ AMDGPU
 CU wavefront execution mode is used if enabled and WGP wavefront execution mode
 is used if disabled (AMDGPU only)
 
+.. option:: -mwavefrontsize64, -mno-wavefrontsize64
+
+Wavefront size 64 is used if enabled and wavefront size 32 if disabled (AMDGPU only)
+
 .. option:: -mxnack, -mno-xnack
 
 Enable XNACK (AMDGPU only)

Modified: cfe/trunk/include/clang/Driver/Options.td
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Options.td?rev=363341&r1=363340&r2=363341&view=diff
==
--- cfe/trunk/include/clang/Driver/Options.td (original)
+++ cfe/trunk/include/clang/Driver/Options.td Thu Jun 13 16:47:59 2019
@@ -2216,6 +2216,11 @@ def mcumode : Flag<["-"], "mcumode">, Gr
 def mno_cumode : Flag<["-"], "mno-cumode">, Group,
   HelpText<"WGP wavefront execution mode is used (AMDGPU only)">;
 
+def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">,
+  Group, HelpText<"Wavefront size 64 is used">;
+def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">,
+  Group, HelpText<"Wavefront size 32 is used">;
+
 def faltivec : Flag<["-"], "faltivec">, Group, Flags<[DriverOption]>;
 def fno_altivec : Flag<["-"], "fno-altivec">, Group, 
Flags<[DriverOption]>;
 def maltivec : Flag<["-"], "maltivec">, Group;

Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=363341&r1=363340&r2=363341&view=diff
==
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Jun 13 16:47:59 2019
@@ -12736,11 +12736,27 @@ Value *CodeGenFunction::EmitAMDGPUBuilti
   case AMDGPU::BI__builtin_amdgcn_uicmp:
   case AMDGPU::BI__builtin_amdgcn_uicmpl:
   case AMDGPU::BI__builtin_amdgcn_sicmp:
-  case AMDGPU::BI__builtin_amdgcn_sicmpl:
-return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_icmp);
+  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
+llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+
+// FIXME-GFX10: How should 32 bit mask be handled?
+Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
+  { Builder.getInt64Ty(), Src0->getType() });
+return Builder.CreateCall(F, { Src0, Src1, Src2 });
+  }
   case AMDGPU::BI__builtin_amdgcn_fcmp:
-  case AMDGPU::BI__builtin_amdgcn_fcmpf:
-return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fcmp);
+  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
+llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+
+// FIXME-GFX10: How should 32 bit mask be handled?
+Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
+  { Builder.getInt64Ty(), Src0->getType() });
+return Builder.CreateCall(F, { Src0, Src1, Src2 });
+  }
   case AMDGPU::BI__builtin_amdgcn_class:
   case AMDGPU::BI__builtin_amdgcn_classf:
   case AMDGPU::BI__builtin_amdgcn_classh:

Modified: cfe/trunk/lib/Driver/ToolChains/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/AMDGPU.cpp?rev=363341&r1=363340&r2=363341&view=diff
==
--- cfe/trunk/lib/Driver/ToolChains/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/AMDGPU.cpp Thu Jun 13 16:47:59 2019
@@ -41,6 +41,17 @@ void amdgpu::getAMDGPUTargetFeatures(con
   if (const Arg *dAbi = Args.getLastArg(options::OPT_mamdgpu_debugger_abi))
 D.Diag(diag::err_drv_clang_unsupported) << dAbi->getAsString(Args);
 
+  if (Args.getLastArg(options::OPT_mwavefrontsize64)) {
+Features.push_back("-wavefrontsize16");
+Features.push_back("-wavefrontsize32");
+Features.push_back("+wavefrontsize64");
+  }
+  if (Args.getLastArg(options::OPT_mno_wavefrontsize64)) {
+Features.push_back

r363345 - [AMDGPU] gfx1011/gfx1012 clang support

2019-06-13 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Thu Jun 13 17:33:59 2019
New Revision: 363345

URL: http://llvm.org/viewvc/llvm-project?rev=363345&view=rev
Log:
[AMDGPU] gfx1011/gfx1012 clang support

Differential Revision: https://reviews.llvm.org/D63308

Modified:
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/Driver/amdgpu-macros.cl
cfe/trunk/test/Driver/amdgpu-mcpu.cl

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=363345&r1=363344&r2=363345&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Thu Jun 13 17:33:59 2019
@@ -135,6 +135,13 @@ bool AMDGPUTargetInfo::initFeatureMap(
   CPU = "gfx600";
 
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
+case GK_GFX1012:
+case GK_GFX1011:
+  Features["dot1-insts"] = true;
+  Features["dot2-insts"] = true;
+  Features["dot5-insts"] = true;
+  Features["dot6-insts"] = true;
+  LLVM_FALLTHROUGH;
 case GK_GFX1010:
   Features["dl-insts"] = true;
   Features["16-bit-insts"] = true;

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=363345&r1=363344&r2=363345&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl Thu Jun 13 17:33:59 2019
@@ -6,6 +6,8 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx904 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX904 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX906 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1010 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1011 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1012 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX801 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX700 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX600 %s
@@ -14,6 +16,8 @@
 // GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1010: 
"target-features"="+16-bit-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime"
+// GFX1011: 
"target-features"="+16-bit-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime"
+// GFX1012: 
"target-features"="+16-bit-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime"
 // GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime"
 // GFX700: "target-features"="+ci-insts,+fp64-fp16-denormals,-fp32-denormals"
 // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"

Modified: cfe/trunk/test/Driver/amdgpu-macros.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/amdgpu-macros.cl?rev=363345&r1=363344&r2=363345&view=diff
==
--- cfe/trunk/test/Driver/amdgpu-macros.cl (original)
+++ cfe/trunk/test/Driver/amdgpu-macros.cl Thu Jun 13 17:33:59 2019
@@ -177,6 +177,8 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX906 %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX909 %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX1010 %s
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX1011 %s
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,GFX1012 %s
 
 // GFX600-DAG: #define FP_FAST_FMA 1
 // GFX601-DAG: #define FP_FAST_FMA 1
@@ -195,6 +197,8 @@
 // GFX906-DAG: #define FP_FAST_FMA 1
 // GFX909-DAG: #define FP_FAST_FMA 1
 // GFX1010-DAG: #define FP_FAST_FMA 1
+// GFX1011-DAG: #define FP_FAST_FMA 1
+// GFX1012-DAG: #define FP_FAST_FMA 1
 
 // GFX600-DAG: #define FP_FAST_FMAF 1
 // GFX601-NOT: #d

r357792 - [AMDGPU] rename vi-insts into gfx8-insts

2019-04-05 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Fri Apr  5 11:25:00 2019
New Revision: 357792

URL: http://llvm.org/viewvc/llvm-project?rev=357792&view=rev
Log:
[AMDGPU] rename vi-insts into gfx8-insts

Differential Revision: https://reviews.llvm.org/D60293

Modified:
cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/SemaOpenCL/builtins-amdgcn-error-vi.cl

Modified: cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def?rev=357792&r1=357791&r2=357792&view=diff
==
--- cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def Fri Apr  5 11:25:00 2019
@@ -133,7 +133,7 @@ TARGET_BUILTIN(__builtin_amdgcn_classh,
 TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "LUi", "n", "s-memrealtime")
 TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
-TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "vi-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
 
 
//===--===//
 // GFX9+ only builtins.

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=357792&r1=357791&r2=357792&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Fri Apr  5 11:25:00 2019
@@ -150,7 +150,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
 case GK_GFX803:
 case GK_GFX802:
 case GK_GFX801:
-  Features["vi-insts"] = true;
+  Features["gfx8-insts"] = true;
   Features["16-bit-insts"] = true;
   Features["dpp"] = true;
   Features["s-memrealtime"] = true;

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=357792&r1=357791&r2=357792&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl Fri Apr  5 11:25:00 2019
@@ -10,9 +10,9 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX600 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX601 %s
 
-// GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
-// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
-// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+s-memrealtime,+vi-insts"
+// GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
+// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
+// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime"
 // GFX700: "target-features"="+ci-insts,+fp64-fp16-denormals,-fp32-denormals"
 // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"
 // GFX601: "target-features"="+fp64-fp16-denormals,-fp32-denormals"

Modified: cfe/trunk/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaOpenCL/builtins-amdgcn-error-vi.cl?rev=357792&r1=357791&r2=357792&view=diff
==
--- cfe/trunk/test/SemaOpenCL/builtins-amdgcn-error-vi.cl (original)
+++ cfe/trunk/test/SemaOpenCL/builtins-amdgcn-error-vi.cl Fri Apr  5 11:25:00 2019
@@ -4,5 +4,5 @@
 
 void test_vi_s_dcache_wb()
 {
-  __builtin_amdgcn_s_dcache_wb(); // expected-error {{'__builtin_amdgcn_s_dcache_wb' needs target feature vi-insts}}
+  __builtin_amdgcn_s_dcache_wb(); // expected-error {{'__builtin_amdgcn_s_dcache_wb' needs target feature gfx8-insts}}
 }


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r360634 - [AMDGPU] gfx1010 clang target

2019-05-13 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Mon May 13 16:15:59 2019
New Revision: 360634

URL: http://llvm.org/viewvc/llvm-project?rev=360634&view=rev
Log:
[AMDGPU] gfx1010 clang target

Differential Revision: https://reviews.llvm.org/D61875

Modified:
cfe/trunk/docs/ClangCommandLineReference.rst
cfe/trunk/include/clang/Driver/Options.td
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/lib/Basic/Targets/AMDGPU.h
cfe/trunk/lib/Driver/ToolChains/HIP.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/Driver/amdgpu-features.c
cfe/trunk/test/Driver/amdgpu-macros.cl
cfe/trunk/test/Driver/amdgpu-mcpu.cl

Modified: cfe/trunk/docs/ClangCommandLineReference.rst
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/docs/ClangCommandLineReference.rst?rev=360634&r1=360633&r2=360634&view=diff
==
--- cfe/trunk/docs/ClangCommandLineReference.rst (original)
+++ cfe/trunk/docs/ClangCommandLineReference.rst Mon May 13 16:15:59 2019
@@ -2396,6 +2396,11 @@ Generate code which only uses the genera
 
 AMDGPU
 --
+.. option:: -mcumode, -mno-cumode
+
+CU wavefront execution mode is used if enabled and WGP wavefront execution mode
+is used if disabled (AMDGPU only)
+
 .. option:: -mxnack, -mno-xnack
 
 Enable XNACK (AMDGPU only)

Modified: cfe/trunk/include/clang/Driver/Options.td
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Options.td?rev=360634&r1=360633&r2=360634&view=diff
==
--- cfe/trunk/include/clang/Driver/Options.td (original)
+++ cfe/trunk/include/clang/Driver/Options.td Mon May 13 16:15:59 2019
@@ -2202,6 +2202,11 @@ def msram_ecc : Flag<["-"], "msram-ecc">
 def mno_sram_ecc : Flag<["-"], "mno-sram-ecc">, Group,
   HelpText<"Disable SRAM ECC (AMDGPU only)">;
 
+def mcumode : Flag<["-"], "mcumode">, Group,
+  HelpText<"CU wavefront execution mode is used (AMDGPU only)">;
+def mno_cumode : Flag<["-"], "mno-cumode">, Group,
+  HelpText<"WGP wavefront execution mode is used (AMDGPU only)">;
+
 def faltivec : Flag<["-"], "faltivec">, Group, Flags<[DriverOption]>;
 def fno_altivec : Flag<["-"], "fno-altivec">, Group, 
Flags<[DriverOption]>;
 def maltivec : Flag<["-"], "maltivec">, Group;

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=360634&r1=360633&r2=360634&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Mon May 13 16:15:59 2019
@@ -135,6 +135,14 @@ bool AMDGPUTargetInfo::initFeatureMap(
   CPU = "gfx600";
 
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
+case GK_GFX1010:
+  Features["dl-insts"] = true;
+  Features["16-bit-insts"] = true;
+  Features["dpp"] = true;
+  Features["gfx9-insts"] = true;
+  Features["gfx10-insts"] = true;
+  Features["s-memrealtime"] = true;
+  break;
 case GK_GFX906:
   Features["dl-insts"] = true;
   Features["dot1-insts"] = true;

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.h
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.h?rev=360634&r1=360633&r2=360634&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.h (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.h Mon May 13 16:15:59 2019
@@ -41,7 +41,6 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUTarg
   llvm::AMDGPU::GPUKind GPUKind;
   unsigned GPUFeatures;
 
-
   bool hasFP64() const {
 return getTriple().getArch() == llvm::Triple::amdgcn ||
!!(GPUFeatures & llvm::AMDGPU::FEATURE_FP64);

Modified: cfe/trunk/lib/Driver/ToolChains/HIP.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/HIP.cpp?rev=360634&r1=360633&r2=360634&view=diff
==
--- cfe/trunk/lib/Driver/ToolChains/HIP.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/HIP.cpp Mon May 13 16:15:59 2019
@@ -307,8 +307,8 @@ void HIPToolChain::addClangTargetOptions
   if (BCLibs.empty()) {
 // Get the bc lib file name for ISA version. For example,
 // gfx803 => oclc_isa_version_803.amdgcn.bc.
-std::string ISAVerBC =
-"oclc_isa_version_" + GpuArch.drop_front(3).str() + ".amdgcn.bc";
+std::string GFXVersion = GpuArch.drop_front(3).str();
+std::string ISAVerBC = "oclc_isa_version_" + GFXVersion + ".amdgcn.bc";
 
 llvm::StringRef FlushDenormalControlBC;
 if (DriverArgs.hasArg(options::OPT_fcuda_flush_denormals_to_zero))

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=360634&r1=360633&r2=360634&view=diff
=

r350794 - [AMDGPU] Separate feature dot-insts

2019-01-09 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Wed Jan  9 19:25:47 2019
New Revision: 350794

URL: http://llvm.org/viewvc/llvm-project?rev=350794&view=rev
Log:
[AMDGPU] Separate feature dot-insts

Differential Revision: https://reviews.llvm.org/D56525

Modified:
cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl

Modified: cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def?rev=350794&r1=350793&r2=350794&view=diff
==
--- cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def Wed Jan  9 19:25:47 2019
@@ -135,13 +135,13 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h,
 // Deep learning builtins.
 
//===--===//
 
-TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dl-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dl-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot-insts")
 
 
//===--===//
 // Special builtins.

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=350794&r1=350793&r2=350794&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Wed Jan  9 19:25:47 2019
@@ -137,6 +137,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
 case GK_GFX906:
   Features["dl-insts"] = true;
+  Features["dot-insts"] = true;
   LLVM_FALLTHROUGH;
 case GK_GFX909:
 case GK_GFX904:

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=350794&r1=350793&r2=350794&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl Wed Jan  9 19:25:47 2019
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX601 %s
 
 // GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
-// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
+// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
 // GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+s-memrealtime,+vi-insts"
 // GFX700: "target-features"="+ci-insts,+fp64-fp16-denormals,-fp32-denormals"
 // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"

Modified: cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl?rev=350794&r1=350793&r2=350794&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl Wed Jan  9 
19:25:47 2019
@@ -12,24 +12,24 @@ kernel void builtins_amdgcn_dl_insts_err
 half2 v2hA, half2 v2hB, float fC,
 short2 v2ssA, short2 v2ssB, int siA, int siB, int siC,
 ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) {
-  fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // 
expected-error {{'__builtin_amdgcn_fdot2' needs target feature dl-insts}}
-  fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);  // 
expected-error {{'__buil

r293190 - Use TargetMachine adjustPassManager hook

2017-01-26 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Thu Jan 26 10:49:21 2017
New Revision: 293190

URL: http://llvm.org/viewvc/llvm-project?rev=293190&view=rev
Log:
Use TargetMachine adjustPassManager hook

Differential Revision: https://reviews.llvm.org/D28340

Modified:
cfe/trunk/lib/CodeGen/BackendUtil.cpp

Modified: cfe/trunk/lib/CodeGen/BackendUtil.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/BackendUtil.cpp?rev=293190&r1=293189&r2=293190&view=diff
==
--- cfe/trunk/lib/CodeGen/BackendUtil.cpp (original)
+++ cfe/trunk/lib/CodeGen/BackendUtil.cpp Thu Jan 26 10:49:21 2017
@@ -334,13 +334,8 @@ void EmitAssemblyHelper::CreatePasses(le
 
   MPM.add(new TargetLibraryInfoWrapperPass(*TLII));
 
-  // Add target-specific passes that need to run as early as possible.
   if (TM)
-PMBuilder.addExtension(
-PassManagerBuilder::EP_EarlyAsPossible,
-[&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
-  TM->addEarlyAsPossiblePasses(PM);
-});
+TM->adjustPassManager(PMBuilder);
 
   PMBuilder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
  addAddDiscriminatorsPass);


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r299678 - [AMDGPU] Translate reqd_work_group_size into amdgpu_flat_work_group_size

2017-04-06 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Thu Apr  6 13:15:44 2017
New Revision: 299678

URL: http://llvm.org/viewvc/llvm-project?rev=299678&view=rev
Log:
[AMDGPU] Translate reqd_work_group_size into amdgpu_flat_work_group_size

These two attributes specify the same info in a different way.
AMDGPU BE only checks the latter as a target specific attribute
as opposed to language specific reqd_work_group_size.

This change produces amdgpu_flat_work_group_size out of
reqd_work_group_size if specified.

Differential Revision: https://reviews.llvm.org/D31728

Modified:
cfe/trunk/lib/CodeGen/TargetInfo.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-attrs.cl

Modified: cfe/trunk/lib/CodeGen/TargetInfo.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/TargetInfo.cpp?rev=299678&r1=299677&r2=299678&view=diff
==
--- cfe/trunk/lib/CodeGen/TargetInfo.cpp (original)
+++ cfe/trunk/lib/CodeGen/TargetInfo.cpp Thu Apr  6 13:15:44 2017
@@ -7302,9 +7302,14 @@ void AMDGPUTargetCodeGenInfo::setTargetA
 
   llvm::Function *F = cast<llvm::Function>(GV);
 
-  if (const auto *Attr = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>()) {
-unsigned Min = Attr->getMin();
-unsigned Max = Attr->getMax();
+  const auto *ReqdWGS = M.getLangOpts().OpenCL ?
+FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
+  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
+  if (ReqdWGS || FlatWGS) {
+unsigned Min = FlatWGS ? FlatWGS->getMin() : 0;
+unsigned Max = FlatWGS ? FlatWGS->getMax() : 0;
+if (ReqdWGS && Min == 0 && Max == 0)
+  Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
 
 if (Min != 0) {
   assert(Min <= Max && "Min must be less than or equal Max");

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-attrs.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-attrs.cl?rev=299678&r1=299677&r2=299678&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-attrs.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-attrs.cl Thu Apr  6 13:15:44 2017
@@ -129,6 +129,16 @@ kernel void flat_work_group_size_32_64_w
 // CHECK: define amdgpu_kernel void 
@flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() 
[[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64:#[0-9]+]]
 }
 
+__attribute__((reqd_work_group_size(32, 2, 1))) // expected-no-diagnostics
+kernel void reqd_work_group_size_32_2_1() {
+// CHECK: define amdgpu_kernel void @reqd_work_group_size_32_2_1() 
[[FLAT_WORK_GROUP_SIZE_64_64:#[0-9]+]]
+}
+__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(16, 
128))) // expected-no-diagnostics
+kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
+// CHECK: define amdgpu_kernel void 
@reqd_work_group_size_32_2_1_flat_work_group_size_16_128() 
[[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
+}
+
+
 // Make sure this is silently accepted on other targets.
 // X86-NOT: "amdgpu-flat-work-group-size"
 // X86-NOT: "amdgpu-waves-per-eu"
@@ -142,6 +152,8 @@ kernel void flat_work_group_size_32_64_w
 // CHECK-NOT: "amdgpu-num-vgpr"="0"
 
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { noinline nounwind 
"amdgpu-flat-work-group-size"="32,64"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_64_64]] = { noinline nounwind 
"amdgpu-flat-work-group-size"="64,64"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_16_128]] = { noinline nounwind 
"amdgpu-flat-work-group-size"="16,128"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { noinline nounwind 
"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2_4]] = { noinline nounwind 
"amdgpu-waves-per-eu"="2,4"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = { noinline nounwind 
"amdgpu-num-sgpr"="32"


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r287006 - [AMDGPU] Add wave barrier builtin

2016-11-15 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Tue Nov 15 12:58:03 2016
New Revision: 287006

URL: http://llvm.org/viewvc/llvm-project?rev=287006&view=rev
Log:
[AMDGPU] Add wave barrier builtin

The wave barrier represents a discardable barrier. Its main purpose is to
carry the convergent attribute, thus preventing illegal CFG optimizations. All
lanes in a wave come to the convergence point simultaneously with SIMT, thus no
special instruction is needed in the ISA. The barrier is discarded during code
generation.

Differential Revision: https://reviews.llvm.org/D26584

Modified:
cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl

Modified: cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def?rev=287006&r1=287005&r2=287006&view=diff
==
--- cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def Tue Nov 15 12:58:03 2016
@@ -36,6 +36,7 @@ BUILTIN(__builtin_amdgcn_workitem_id_z,
 // Instruction builtins.
 
//===--===//
 BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
+BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_div_scale, "dddbb*", "n")
 BUILTIN(__builtin_amdgcn_div_scalef, "fffbb*", "n")
 BUILTIN(__builtin_amdgcn_div_fmas, "b", "nc")

Modified: cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl?rev=287006&r1=287005&r2=287006&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl Tue Nov 15 12:58:03 2016
@@ -270,6 +270,13 @@ void test_s_barrier()
   __builtin_amdgcn_s_barrier();
 }
 
+// CHECK-LABEL: @test_wave_barrier
+// CHECK: call void @llvm.amdgcn.wave.barrier(
+void test_wave_barrier()
+{
+  __builtin_amdgcn_wave_barrier();
+}
+
 // CHECK-LABEL: @test_s_memtime
 // CHECK: call i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 8e661d3 - [AMDGPU] Set s-memtime-inst feature from clang

2021-02-01 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2021-02-01T14:20:43-08:00
New Revision: 8e661d3d9c52ea9e5e68cbf699701d9cfa071a8f

URL: 
https://github.com/llvm/llvm-project/commit/8e661d3d9c52ea9e5e68cbf699701d9cfa071a8f
DIFF: 
https://github.com/llvm/llvm-project/commit/8e661d3d9c52ea9e5e68cbf699701d9cfa071a8f.diff

LOG: [AMDGPU] Set s-memtime-inst feature from clang

Differential Revision: https://reviews.llvm.org/D95733

Added: 


Modified: 
clang/lib/Basic/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl

Removed: 




diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index 91c1e83f61cb..e6450c9f9ff8 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -210,6 +210,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
   Features["gfx9-insts"] = true;
   Features["gfx10-insts"] = true;
   Features["s-memrealtime"] = true;
+  Features["s-memtime-inst"] = true;
   break;
 case GK_GFX908:
   Features["dot3-insts"] = true;
@@ -252,6 +253,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
 case GK_GFX602:
 case GK_GFX601:
 case GK_GFX600:
+  Features["s-memtime-inst"] = true;
   break;
 case GK_NONE:
   break;

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 740521aa3bca..a796d07cb2e5 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -32,30 +32,30 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1032 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1032 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1033 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1033 %s
 
-// GFX600-NOT: "target-features"
-// GFX601-NOT: "target-features"
-// GFX602-NOT: "target-features"
-// GFX700: "target-features"="+ci-insts,+flat-address-space"
-// GFX701: "target-features"="+ci-insts,+flat-address-space"
-// GFX702: "target-features"="+ci-insts,+flat-address-space"
-// GFX703: "target-features"="+ci-insts,+flat-address-space"
-// GFX704: "target-features"="+ci-insts,+flat-address-space"
-// GFX705: "target-features"="+ci-insts,+flat-address-space"
-// GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX802: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX803: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX805: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX810: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime"
-// GFX900: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX902: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX908: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime"
-// GFX909: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX90C: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1010: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1011: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
+// GFX600: "target-features"="+s-memtime-inst"
+// GFX601: "target-features"="+s-memtime-inst"
+// GFX602: "target-features"="+s-memtime-inst"
+// GFX700: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX701: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX702: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX703: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX704: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX705: "target-features"="+ci-insts,+flat-address-space,+s-memtime-inst"
+// GFX801: 
"target-fe

[clang] 59691dc - [AMDGPU] Make ds fp atomics overloadable

2020-09-23 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-09-23T11:39:50-07:00
New Revision: 59691dc8740c7eada7fcf5552e0d2377780c6fb7

URL: 
https://github.com/llvm/llvm-project/commit/59691dc8740c7eada7fcf5552e0d2377780c6fb7
DIFF: 
https://github.com/llvm/llvm-project/commit/59691dc8740c7eada7fcf5552e0d2377780c6fb7.diff

LOG: [AMDGPU] Make ds fp atomics overloadable

Differential Revision: https://reviews.llvm.org/D87947

Added: 


Modified: 
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenCUDA/builtins-amdgcn.cu
clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll

Removed: 




diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3c7f13a006d0..92c537f32b59 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14746,6 +14746,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+Intrinsic::ID Intrin;
+switch (BuiltinID) {
+case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  Intrin = Intrinsic::amdgcn_ds_fadd;
+  break;
+case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  Intrin = Intrinsic::amdgcn_ds_fmin;
+  break;
+case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  Intrin = Intrinsic::amdgcn_ds_fmax;
+  break;
+}
+llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
+llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
+llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
+llvm::FunctionType *FTy = F->getFunctionType();
+llvm::Type *PTy = FTy->getParamType(0);
+Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
+return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec: {
 CallInst *CI = cast(
   EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, 
"exec"));

diff  --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu 
b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index c10eae96d71a..1c3a79064595 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -10,7 +10,7 @@ __global__ void use_dispatch_ptr(int* out) {
 }
 
 // CHECK-LABEL: @_Z12test_ds_fmaxf(
-// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* 
@_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
+// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* 
@_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
 __global__
 void test_ds_fmax(float src) {
   __shared__ float shared;

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 5884f84ab081..4408b043296a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -114,19 +114,19 @@ void test_update_dpp(global int* out, int arg1, int arg2)
 }
 
 // CHECK-LABEL: @test_ds_fadd
-// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
 void test_ds_faddf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmin
-// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
 void test_ds_fminf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmax
-// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float 
%src, i32 0, i32 0, i1 false)
 void test_ds_fmaxf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
 }

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 3df07e81e95b..918ab3efc0ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -397,11 +397,10 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
 
-cl

[clang] 4fcdfc4 - [AMDGPU] Simplify amdgpu-macros.cl test. NFC.

2020-11-05 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-11-05T16:29:16-08:00
New Revision: 4fcdfc4398bdf9295cd0259d6416a7dc1d2da47f

URL: 
https://github.com/llvm/llvm-project/commit/4fcdfc4398bdf9295cd0259d6416a7dc1d2da47f
DIFF: 
https://github.com/llvm/llvm-project/commit/4fcdfc4398bdf9295cd0259d6416a7dc1d2da47f.diff

LOG: [AMDGPU] Simplify amdgpu-macros.cl test. NFC.

Differential Revision: https://reviews.llvm.org/D90886

Added: 


Modified: 
clang/test/Driver/amdgpu-macros.cl

Removed: 




diff  --git a/clang/test/Driver/amdgpu-macros.cl 
b/clang/test/Driver/amdgpu-macros.cl
index 87544e58c25d9..57b54acf85abf 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -5,66 +5,35 @@
 // R600-based processors.
 //
 
-// RUN: %clang -E -dM -target r600 -mcpu=r600 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,R600 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv630 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,R600 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv635 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,R600 %s
-// RUN: %clang -E -dM -target r600 -mcpu=r630 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,R630 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rs780 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RS880 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rs880 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RS880 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv610 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RS880 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv620 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RS880 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv670 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RV670 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv710 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RV710 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv730 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RV730 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv740 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RV770 %s
-// RUN: %clang -E -dM -target r600 -mcpu=rv770 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,RV770 %s
-// RUN: %clang -E -dM -target r600 -mcpu=cedar %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CEDAR %s
-// RUN: %clang -E -dM -target r600 -mcpu=palm %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CEDAR %s
-// RUN: %clang -E -dM -target r600 -mcpu=cypress %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CYPRESS %s
-// RUN: %clang -E -dM -target r600 -mcpu=hemlock %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CYPRESS %s
-// RUN: %clang -E -dM -target r600 -mcpu=juniper %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,JUNIPER %s
-// RUN: %clang -E -dM -target r600 -mcpu=redwood %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,REDWOOD %s
-// RUN: %clang -E -dM -target r600 -mcpu=sumo %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,SUMO %s
-// RUN: %clang -E -dM -target r600 -mcpu=sumo2 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,SUMO %s
-// RUN: %clang -E -dM -target r600 -mcpu=barts %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,BARTS %s
-// RUN: %clang -E -dM -target r600 -mcpu=caicos %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CAICOS %s
-// RUN: %clang -E -dM -target r600 -mcpu=aruba %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CAYMAN %s
-// RUN: %clang -E -dM -target r600 -mcpu=cayman %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,CAYMAN %s
-// RUN: %clang -E -dM -target r600 -mcpu=turks %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,TURKS %s
-
-// R600-NOT:#define FP_FAST_FMA 1
-// R630-NOT:#define FP_FAST_FMA 1
-// RS880-NOT:   #define FP_FAST_FMA 1
-// RV670-NOT:   #define FP_FAST_FMA 1
-// RV710-NOT:   #define FP_FAST_FMA 1
-// RV730-NOT:   #define FP_FAST_FMA 1
-// RV770-NOT:   #define FP_FAST_FMA 1
-// CEDAR-NOT:   #define FP_FAST_FMA 1
-// CYPRESS-NOT: #define FP_FAST_FMA 1
-// JUNIPER-NOT: #define FP_FAST_FMA 1
-// REDWOOD-NOT: #define FP_FAST_FMA 1
-// SUMO-NOT:#define FP_FAST_FMA 1
-// BARTS-NOT:   #define FP_FAST_FMA 1
-// CAICOS-NOT:  #define FP_FAST_FMA 1
-// CAYMAN-NOT:  #define FP_FAST_FMA 1
-// TURKS-NOT:   #define FP_FAST_FMA 1
-
-// R600-NOT:#define FP_FAST_FMAF 1
-// R630-NOT:#define FP_FAST_FMAF 1
-// RS880-NOT:   #define FP_FAST_FMAF 1
-// RV670-NOT:   #define FP_FAST_FMAF 1
-// RV710-NOT:   #define FP_FAST_FMAF 1
-// RV730-NOT:   #define FP_FAST_FMAF 1
-// RV770-NOT:   #define FP_FAST_FMAF 1
-// CEDAR-NOT:   #define FP_FAST_FMAF 1
-// CYPRESS-NOT: #define FP_FAST_FMAF 1
-// JUNIPER-NOT: #define FP_FAST_FMAF 1
-// REDWOOD-NOT: #define FP_FAST_FMAF 1
-// SUMO-NOT:#define FP_FAST_FMAF 1
-// BARTS-NOT:   #define FP_FAST_FMAF 1
-// CAICOS-NOT:  #define FP_FAST_FMAF 1
-// CAYMAN-NOT:  #define FP_FAST_FMAF 1
-// TURKS-NOT:   #define FP_FAST_FMAF 1
+// RUN: %clang -E -dM -target r600 -mcpu=r600 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-R600,R600 %s -DCPU=r600
+// RUN: %clang -E -dM -target r600

[clang] d1beb95 - [AMDGPU] gfx1032 target

2020-10-15 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2020-10-15T12:41:18-07:00
New Revision: d1beb95d1241ec50bdf19db351d273374a146a4a

URL: 
https://github.com/llvm/llvm-project/commit/d1beb95d1241ec50bdf19db351d273374a146a4a
DIFF: 
https://github.com/llvm/llvm-project/commit/d1beb95d1241ec50bdf19db351d273374a146a4a.diff

LOG: [AMDGPU] gfx1032 target

Differential Revision: https://reviews.llvm.org/D89487

Added: 


Modified: 
clang/include/clang/Basic/Cuda.h
clang/lib/Basic/Targets/AMDGPU.cpp
clang/lib/Basic/Targets/NVPTX.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
clang/test/Driver/amdgpu-macros.cl
clang/test/Driver/amdgpu-mcpu.cl
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/BinaryFormat/ELF.h
llvm/include/llvm/Support/TargetParser.h
llvm/lib/ObjectYAML/ELFYAML.cpp
llvm/lib/Support/TargetParser.cpp
llvm/lib/Target/AMDGPU/GCNProcessors.td
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll
llvm/test/MC/AMDGPU/gfx1030_err.s
llvm/test/MC/AMDGPU/gfx1030_new.s
llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
llvm/tools/llvm-readobj/ELFDumper.cpp

Removed: 




diff  --git a/clang/include/clang/Basic/Cuda.h 
b/clang/include/clang/Basic/Cuda.h
index 417d40c28adf..501e47b0e2c2 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -80,6 +80,7 @@ enum class CudaArch {
   GFX1012,
   GFX1030,
   GFX1031,
+  GFX1032,
   LAST,
 };
 

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index 42db207b9ce5..0b3aebdfa15c 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -174,6 +174,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
   // XXX - What does the member GPU mean if device name string passed here?
   if (isAMDGCN(getTriple())) {
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
+case GK_GFX1032:
 case GK_GFX1031:
 case GK_GFX1030:
   Features["ci-insts"] = true;

diff  --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index 3780f1cc250c..26c5b26beeef 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -205,6 +205,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
   case CudaArch::GFX1031:
+  case CudaArch::GFX1032:
   case CudaArch::LAST:
 break;
   case CudaArch::UNUSED:

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index ab7ee8b33a0c..bcabc5398127 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -4645,6 +4645,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
   case CudaArch::GFX1031:
+  case CudaArch::GFX1032:
   case CudaArch::UNUSED:
   case CudaArch::UNKNOWN:
 break;
@@ -4710,6 +4711,7 @@ static std::pair<unsigned, unsigned> 
getSMsBlocksPerSM(CodeGenModule &CGM) {
   case CudaArch::GFX1012:
   case CudaArch::GFX1030:
   case CudaArch::GFX1031:
+  case CudaArch::GFX1032:
   case CudaArch::UNUSED:
   case CudaArch::UNKNOWN:
 break;

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 5a814f36e564..a463c061114e 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -16,6 +16,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1012 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1030 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1031 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1031 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1032 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1032 %s
 
 // GFX600-NOT: "target-features"
 // GFX601-NOT: "target-features"
@@ -30,5 +31,6 @@
 // GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1030: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
 // GFX1031: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
+// GFX1032: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-i

[clang] 502b3bf - [AMDGPU] require s-memtime-inst for __builtin_amdgcn_s_memtime

2021-02-25 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2021-02-25T08:31:59-08:00
New Revision: 502b3bfc6a713e5b6640faf48e72de08d7cb0aba

URL: 
https://github.com/llvm/llvm-project/commit/502b3bfc6a713e5b6640faf48e72de08d7cb0aba
DIFF: 
https://github.com/llvm/llvm-project/commit/502b3bfc6a713e5b6640faf48e72de08d7cb0aba.diff

LOG: [AMDGPU] require s-memtime-inst for __builtin_amdgcn_s_memtime

Differential Revision: https://reviews.llvm.org/D97420

Added: 
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1030.cl

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3544abe35896..415d8cb3e73a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -44,6 +44,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
+TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "LUi", "n", "s-memtime-inst")
+
 
//===--===//
 // Instruction builtins.
 
//===--===//
@@ -105,7 +107,6 @@ BUILTIN(__builtin_amdgcn_cubeid, "", "nc")
 BUILTIN(__builtin_amdgcn_cubesc, "", "nc")
 BUILTIN(__builtin_amdgcn_cubetc, "", "nc")
 BUILTIN(__builtin_amdgcn_cubema, "", "nc")
-BUILTIN(__builtin_amdgcn_s_memtime, "LUi", "n")
 BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n")

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
index b3bf3681b935..56da7eceb6de 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
@@ -5,6 +5,7 @@
 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown 
-target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
+typedef unsigned long ulong;
 
 // CHECK-LABEL: @test_s_dcache_inv_vol
 // CHECK: call void @llvm.amdgcn.s.dcache.inv.vol(
@@ -27,6 +28,13 @@ void test_gws_sema_release_all(uint id)
   __builtin_amdgcn_ds_gws_sema_release_all(id);
 }
 
+// CHECK-LABEL: @test_s_memtime
+// CHECK: call i64 @llvm.amdgcn.s.memtime()
+void test_s_memtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}
+
 // CHECK-LABEL: @test_is_shared(
 // CHECK: [[CAST:%[0-9]+]] = bitcast i32* %{{[0-9]+}} to i8*
 // CHECK: call i1 @llvm.amdgcn.is.shared(i8* [[CAST]]

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index a90d55fa8a78..a9bbaa9c3f54 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -S 
-emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
+typedef unsigned long ulong;
 
 // CHECK-LABEL: @test_permlane16(
 // CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 
false, i1 false)
@@ -22,3 +23,10 @@ void test_permlanex16(global uint* out, uint a, uint b, uint 
c, uint d) {
 void test_mov_dpp8(global uint* out, uint a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
+
+// CHECK-LABEL: @test_s_memtime
+// CHECK: call i64 @llvm.amdgcn.s.memtime()
+void test_s_memtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
index 344af2b27670..420506ec083c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S 
-emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef unsigned long ulong;
 
 // CHECK-LABEL: @test_fmed3_f16
 // CHECK: call half @llvm.amdgcn.fmed3.f16(half %a, half %b, half %c)
@@ -10,3 +11,10 @@ void test_fmed3_f16(global half* out, half a, half b, half c)
 {
   *out = __builtin_amdgcn_fmed3h(a, b, c);
 }
+
+// CHECK-LABEL: @test_s_memtime
+// CHECK: call i64 @llvm.amdgcn.s.memtime()
+void test_s_memtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 4408b043296a..49faf069c8c6 100644
--- a/clang/test/CodeGe

[clang] 21280d3 - [clang] SimpleMFlag helper in Options.td

2021-03-01 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2021-03-01T09:00:30-08:00
New Revision: 21280d35d652788309176831bd88257b58f674f9

URL: 
https://github.com/llvm/llvm-project/commit/21280d35d652788309176831bd88257b58f674f9
DIFF: 
https://github.com/llvm/llvm-project/commit/21280d35d652788309176831bd88257b58f674f9.diff

LOG: [clang] SimpleMFlag helper in Options.td

This is the new helper to create a boolean -m and -mno-
options.

Differential Revision: https://reviews.llvm.org/D97069

Added: 


Modified: 
clang/include/clang/Driver/Options.td

Removed: 




diff  --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index f4da19324f02..75c4eff29f1c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -289,6 +289,17 @@ multiclass OptOutFFlag, HelpText;
 }
 
+// Creates a positive and negative flags where both of them are prefixed with
+// "m", have help text specified for positive and negative option, and a Group
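+// optionally specified by the opt_group argument, otherwise Group<m_Group>.
+multiclass SimpleMFlag<string name, string pos_prefix, string neg_prefix,
+                       string help, OptionGroup opt_group = m_Group> {
+  def m#NAME : Flag<["-"], "m"#name>, Group<opt_group>,
+    HelpText<!strconcat(pos_prefix, help)>;
+  def mno_#NAME : Flag<["-"], "mno-"#name>, Group<opt_group>,
+    HelpText<!strconcat(neg_prefix, help)>;
+}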
+
 
//===--===//
 // BoolOption
 
//===--===//
@@ -3138,23 +3149,18 @@ def mcode_object_version_EQ : Joined<["-"], 
"mcode-object-version=">, Group,
   MetaVarName<"">, Values<"2,3,4">;
 
-def mcode_object_v3_legacy : Flag<["-"], "mcode-object-v3">, Group,
-  HelpText<"Legacy option to specify code object ABI V2 (-mnocode-object-v3) 
or V3 (-mcode-object-v3) (AMDGPU only)">;
-def mno_code_object_v3_legacy : Flag<["-"], "mno-code-object-v3">, 
Group;
-
-def mcumode : Flag<["-"], "mcumode">, Group,
-  HelpText<"Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution 
mode (AMDGPU only)">;
-def mno_cumode : Flag<["-"], "mno-cumode">, Group;
-
-def mtgsplit : Flag<["-"], "mtgsplit">, Group,
-  HelpText<"Enable threadgroup split execution mode (AMDGPU only)">;
-def mno_tgsplit : Flag<["-"], "mno-tgsplit">, Group,
-  HelpText<"Disable threadgroup split execution mode (AMDGPU only)">;
-
-def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">, Group,
-  HelpText<"Specify wavefront size 64 mode (AMDGPU only)">;
-def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">, Group,
-  HelpText<"Specify wavefront size 32 mode (AMDGPU only)">;
+defm code_object_v3_legacy : SimpleMFlag<"code-object-v3",
+  "Legacy option to specify code object ABI V3",
+  "Legacy option to specify code object ABI V2",
+  " (AMDGPU only)">;
+defm cumode : SimpleMFlag<"cumode",
+  "Specify CU wavefront", "Specify WGP wavefront",
+  " execution mode (AMDGPU only)", m_amdgpu_Features_Group>;
+defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable",
+  " threadgroup split execution mode (AMDGPU only)", m_amdgpu_Features_Group>;
+defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
+  "Specify wavefront size 64", "Specify wavefront size 32",
+  " mode (AMDGPU only)">;
 
 defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
   TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse,



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 189310a - [AMDGPU] Allow -amdgpu-unsafe-fp-atomics to ignore denorm mode

2021-04-08 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2021-04-08T12:46:36-07:00
New Revision: 189310a140fa1c33f8f4838560f567bab9e99245

URL: 
https://github.com/llvm/llvm-project/commit/189310a140fa1c33f8f4838560f567bab9e99245
DIFF: 
https://github.com/llvm/llvm-project/commit/189310a140fa1c33f8f4838560f567bab9e99245.diff

LOG: [AMDGPU] Allow -amdgpu-unsafe-fp-atomics to ignore denorm mode

Fixes: SWDEV-274276

Differential Revision: https://reviews.llvm.org/D100072

Added: 


Modified: 
clang/docs/ClangCommandLineReference.rst
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll

Removed: 




diff  --git a/clang/docs/ClangCommandLineReference.rst 
b/clang/docs/ClangCommandLineReference.rst
index d895587c458a4..97812f2b6e29e 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -2997,6 +2997,10 @@ Enable threadgroup split execution mode (AMDGPU only)
 
 Specify XNACK mode (AMDGPU only)
 
+.. option:: -munsafe-fp-atomics, -mno-unsafe-fp-atomics
+
+Enable generation of unsafe floating point atomic instructions. May generate 
more efficient code, but may not respect rounding and denormal modes, and may 
give incorrect results for certain memory destinations. (AMDGPU only)
+
 ARM
 ---
 .. option:: -faapcs-bitfield-load

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e8cfca1c726e1..596612e79036e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12055,9 +12055,14 @@ 
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
 if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
  Subtarget->hasAtomicFaddInsts()) {
-  if (!fpModeMatchesGlobalFPAtomicMode(RMW) ||
-  RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics")
-  .getValueAsString() != "true")
+  // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+  // floating point atomic instructions. May generate more efficient code,
+  // but may not respect rounding and denormal modes, and may give 
incorrect
+  // results for certain memory destinations.
+  if (!fpModeMatchesGlobalFPAtomicMode(RMW) &&
+  RMW->getFunction()
+  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+  .getValueAsString() != "true")
 return AtomicExpansionKind::CmpXChg;
 
   if (Subtarget->hasGFX90AInsts()) {

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll 
b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 3047d4a3eff38..d5348038f3c3d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -490,25 +490,13 @@ define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat_flush(double addrspa
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
 ; GFX90A:   ; %bb.0: ; %main_body
 ; GFX90A-NEXT:s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:s_mov_b64 s[2:3], 0
-; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:  BB27_1: ; %atomicrmw.start
-; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT:v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:v_mov_b32_e32 v1, 0x4010
 ; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT:global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:buffer_wbinvl1_vol
-; GFX90A-NEXT:v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT:s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT:v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:s_cbranch_execnz BB27_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:s_endpgm
 main_body:
   %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 
syncscope("agent") seq_cst

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll 
b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index ecc914123405d..143ef2d14c3d5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -171,26 +171,12 @@ define amdgpu_kernel void 
@global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX90A:   ; %bb.0:
 ; GFX90A-NEXT:s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:s_mov_b64 s[2:3], 0
-; GFX90A-NEXT:s_waitcnt

[clang] c714d03 - [AMDGPU] Expose __builtin_amdgcn_perm for v_perm_b32

2021-05-06 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2021-05-06T16:17:33-07:00
New Revision: c714d037857f9c8e3bbe32e22ec22279121c378d

URL: 
https://github.com/llvm/llvm-project/commit/c714d037857f9c8e3bbe32e22ec22279121c378d
DIFF: 
https://github.com/llvm/llvm-project/commit/c714d037857f9c8e3bbe32e22ec22279121c378d.diff

LOG: [AMDGPU] Expose __builtin_amdgcn_perm for v_perm_b32

Differential Revision: https://reviews.llvm.org/D102022

Added: 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9677b1aadb518..7dcbf9a096961 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -182,6 +182,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "LUi", "n", 
"s-memrealtime")
 TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
 
 
//===--===//
 // GFX9+ only builtins.

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 49faf069c8c68..be594585fad6d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -7,6 +7,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 typedef unsigned long ulong;
+typedef unsigned int  uint;
 
 // CHECK-LABEL: @test_div_fixup_f16
 // CHECK: call half @llvm.amdgcn.div.fixup.f16
@@ -137,3 +138,10 @@ void test_s_memtime(global ulong* out)
 {
   *out = __builtin_amdgcn_s_memtime();
 }
+
+// CHECK-LABEL: @test_perm
+// CHECK: call i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
+void test_perm(global uint* out, uint a, uint b, uint s)
+{
+  *out = __builtin_amdgcn_perm(a, b, s);
+}

diff  --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
index 849c826c5b8cb..c23f4a452d83e 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
@@ -2,7 +2,8 @@
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu tahiti -verify -S -o - %s
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu hawaii -verify -S -o - %s
 
-void test_vi_s_dcache_wb()
+void test_vi_builtins()
 {
   __builtin_amdgcn_s_dcache_wb(); // expected-error 
{{'__builtin_amdgcn_s_dcache_wb' needs target feature gfx8-insts}}
+  (void)__builtin_amdgcn_perm(1, 2, 3); // expected-error 
{{'__builtin_amdgcn_perm' needs target feature gfx8-insts}}
 }

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 7b62b9de79b22..46a7aeb39c9a5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1716,6 +1716,12 @@ def int_amdgcn_ds_bpermute :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
+// llvm.amdgcn.perm <src0> <src1> <selector>
+def int_amdgcn_perm :
+  GCCBuiltin<"__builtin_amdgcn_perm">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
 
//===--===//
 // GFX10 Intrinsics
 
//===--===//

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td 
b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c0cb1781abe34..d63bd2e9eb2e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -313,7 +313,7 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
SDTCisInt<4>]>,
   []>;
 
-def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
@@ -463,3 +463,7 @@ def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, 
node:$src2, node:$clamp)
 def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, 
node:$vcc),
   [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
(AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;
+
+def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+

[clang] 932f628 - [AMDGPU] new gfx940 fp atomics

2022-03-07 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-03-07T12:32:02-08:00
New Revision: 932f628121d85281ef1a2410dd158a735acdea5e

URL: 
https://github.com/llvm/llvm-project/commit/932f628121d85281ef1a2410dd158a735acdea5e
DIFF: 
https://github.com/llvm/llvm-project/commit/932f628121d85281ef1a2410dd158a735acdea5e.diff

LOG: [AMDGPU] new gfx940 fp atomics

Differential Revision: https://reviews.llvm.org/D121028

Added: 
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/MC/AMDGPU/gfx940_asm_features.s
llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 2e1d3c7ccbff9..3b7ff75a9410a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -209,6 +209,12 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, 
"dd*0d", "t", "gfx90a-inst
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", 
"gfx90a-insts")
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", 
"gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", 
"gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", 
"gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", 
"gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", 
"gfx940-insts")
+
 
//===--===//
 // Deep learning builtins.
 
//===--===//

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index acbeac326ece6..603ae6e352b40 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16513,7 +16513,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -16544,6 +16546,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+  IID = Intrinsic::amdgcn_flat_atomic_fadd;
+  break;
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  ArgTy = llvm::FixedVectorType::get(
+  llvm::Type::getHalfTy(getLLVMContext()), 2);
+  IID = Intrinsic::amdgcn_flat_atomic_fadd;
+  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -16551,6 +16562,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+Intrinsic::ID IID;
+switch (BuiltinID) {
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
+  break;
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
+  break;
+}
+llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
+return Builder.CreateCall(F, {Addr, Val});
+  }
   case AMDGPU::BI__builtin_amdgcn_ds_ato

[clang] 9eabea3 - [AMDGPU] Set noclobber metadata on loads instead of cast to constant

2022-03-07 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-03-07T23:13:02-08:00
New Revision: 9eabea396814c5580978cd4766b524bef57844cf

URL: 
https://github.com/llvm/llvm-project/commit/9eabea396814c5580978cd4766b524bef57844cf
DIFF: 
https://github.com/llvm/llvm-project/commit/9eabea396814c5580978cd4766b524bef57844cf.diff

LOG: [AMDGPU] Set noclobber metadata on loads instead of cast to constant

A load via pointer cast to constant will return true from
pointsToConstantMemory which is not necessarily so.

Fixes: SWDEV-326463

Differential Revision: https://reviews.llvm.org/D121172

Added: 


Modified: 
clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll

Removed: 




diff  --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu 
b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index 01e0d3db46127..8796c8197ed16 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -18,7 +18,7 @@
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(i32 
addrspace(1)*{{.*}} %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4, 
!amdgpu.noclobber !2
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -30,7 +30,7 @@ __global__ void kernel1(int *x) {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(i32 
addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4, 
!amdgpu.noclobber !2
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -68,8 +68,7 @@ struct S {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* 
%0, i64 0, i32 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[G2:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(4)*
-// OPT: [[V0:%.*]] = load i32, i32 addrspace(4)* [[G2]], align 4
+// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4, 
!amdgpu.noclobber !2
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -104,8 +103,7 @@ struct T {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* 
%0, i64 0, i32 0, i64 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[G2:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(4)*
-// OPT: [[V0:%.*]] = load float, float addrspace(4)* [[G2]], align 4
+// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4, 
!amdgpu.noclobber !2
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.00e+00
 // OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -132,7 +130,7 @@ struct SS {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(float 
addrspace(1)*{{.*}} %a.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to 
[[TYPE]]*
-// OPT: [[VAL:%.*]] = load float, float addrspace(4)* %a.coerce.const, align 4
+// OPT: [[VAL:%.*]] = load float, float addrspace(1)* %a.coerce, align 4, 
!amdgpu.noclobber !2
 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.00e+00
 // OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4
 // OPT: ret void

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index 65ad8b2aeacd3..ed450f59e4b30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -131,31 +131,7 @@ bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst 
*LI) {
   if (!LI->isSimple())
 return false;
 
-  Value *Ptr = LI->getPointerOperand();
-
-  // Strip casts we have created earlier.
-  Value *OrigPtr = Ptr

[clang] 9fa5a6b - [AMDGPU] Support for gfx940 fp8 conversions

2022-07-18 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-07-18T11:48:43-07:00
New Revision: 9fa5a6b7e8a292ec91b844a622836d2990ef5796

URL: 
https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796
DIFF: 
https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796.diff

LOG: [AMDGPU] Support for gfx940 fp8 conversions

Differential Revision: https://reviews.llvm.org/D129902

Added: 
clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/Basic/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/MC/AMDGPU/gfx940_asm_features.s
llvm/test/MC/AMDGPU/gfx940_err.s
llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 68bcf546d177c..e9f25d783e596 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,5 +346,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, 
"V16fV4sV8sV16fiIiIi",
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", 
"nc", "mai-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index 50256d8e210c9..80f2601b0a245 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -250,6 +250,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
   break;
 case GK_GFX940:
   Features["gfx940-insts"] = true;
+  Features["fp8-insts"] = true;
   LLVM_FALLTHROUGH;
 case GK_GFX90A:
   Features["gfx90a-insts"] = true;

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index cb3a3eff01f70..ff288e530d17f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -64,7 +64,7 @@
 // GFX909: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90A: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90C: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX940: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
+// GFX940: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1010: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1011: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
new file mode 100644

[clang] 2695f0a - [AMDGPU] Support for gfx940 fp8 mfma

2022-07-18 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-07-18T11:49:56-07:00
New Revision: 2695f0a688e9d26fcb0f3a4b686a2783f2eb145c

URL: 
https://github.com/llvm/llvm-project/commit/2695f0a688e9d26fcb0f3a4b686a2783f2eb145c
DIFF: 
https://github.com/llvm/llvm-project/commit/2695f0a688e9d26fcb0f3a4b686a2783f2eb145c.diff

LOG: [AMDGPU] Support for gfx940 fp8 mfma

Differential Revision: https://reviews.llvm.org/D129906

Added: 


Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e9f25d783e596..e992e22ca527a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -339,6 +339,14 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, 
"V4iWiWiV4iIiIiIi", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8, "V4fWiWiV4fIiIiIi", 
"nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8, "V4fWiWiV4fIiIiIi", 
"nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8, "V4fWiWiV4fIiIiIi", 
"nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8, "V4fWiWiV4fIiIiIi", 
"nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8, 
"V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8, 
"V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8, 
"V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8, 
"V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_f16, "V4fV4hV8hV4fiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_f16, 
"V16fV4hV8hV16fiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", 
"nc", "mai-insts")

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 8e3cc7e382e90..192bb1062381d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -251,6 +251,62 @@ void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, 
v2f b, v16f c)
   *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
 }
 
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 
%a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 
%a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 
%a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 
%a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 
%a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_

[clang] 523a99c - [AMDGPU] Support for gfx940 fp8 smfmac

2022-07-18 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-07-18T12:12:41-07:00
New Revision: 523a99c0eb0331680905e9ef6fbdd114f4ee7a47

URL: 
https://github.com/llvm/llvm-project/commit/523a99c0eb0331680905e9ef6fbdd114f4ee7a47
DIFF: 
https://github.com/llvm/llvm-project/commit/523a99c0eb0331680905e9ef6fbdd114f4ee7a47.diff

LOG: [AMDGPU] Support for gfx940 fp8 smfmac

Differential Revision: https://reviews.llvm.org/D129908

Added: 


Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e992e22ca527..cdf5f5a85418 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -353,6 +353,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, 
"V4fV4sV8sV4fiIiIi", "
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, 
"V16fV4sV8sV16fiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8, 
"V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8, 
"V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8, 
"V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8, 
"V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 192bb1062381..1819ff0a6177 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -348,4 +348,60 @@ void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, 
v4i b, v16i c, int idx
 {
   *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
 }
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 
x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, 
int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 
x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, 
int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 
x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, 
int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 
x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, 
int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 
x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfm

[clang] 27439a7 - [AMDGPU] New gfx940 mfma instructions

2022-03-24 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-03-24T12:12:52-07:00
New Revision: 27439a764230e5eb54568b2fc053a20c9005970f

URL: 
https://github.com/llvm/llvm-project/commit/27439a764230e5eb54568b2fc053a20c9005970f
DIFF: 
https://github.com/llvm/llvm-project/commit/27439a764230e5eb54568b2fc053a20c9005970f.diff

LOG: [AMDGPU] New gfx940 mfma instructions

Differential Revision: https://reviews.llvm.org/D122044

Added: 
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SISchedule.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index d2e60f85b9feb..3870b1cca6caa 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -305,5 +305,10 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, 
"V4fV4sV4sV4fIiIiIi",
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", 
"mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", 
"mai-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, "V4iWiWiV4iIiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", 
"nc", "mai-insts")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 19ac40fe41605..fc29faf9ad1c5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -1,9 +1,11 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 
-DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX908
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a 
-DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX90A
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 
-DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX940
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
+typedef float  v2f   __attribute__((ext_vector_type(2)));
 typedef float  v4f   __attribute__((ext_vector_type(4)));
 typedef float  v16f  __attribute__((ext_vector_type(16)));
 typedef float  v32f  __attribute__((ext_vector_type(32)));
@@ -216,3 +218,33 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, 
double b, double c)
 }
 
 #endif // MFMA_GFX90A_TESTS
+
+#ifdef MFMA_GFX940_TESTS
+// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
+// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 
%b, <4 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
+// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 
%b, <16 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x 
float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x 
float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
+}
+#endif // MFMA_GFX940_TESTS

diff  --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
new file mode 100644
index 0..9e50a1

[clang] 6e3e14f - [AMDGPU] Support gfx940 smfmac instructions

2022-03-24 Thread Stanislav Mekhanoshin via cfe-commits


Author: Stanislav Mekhanoshin
Date: 2022-03-24T12:40:42-07:00
New Revision: 6e3e14f600afa1fa64a699df97c8bbac6d0f8b5a

URL: 
https://github.com/llvm/llvm-project/commit/6e3e14f600afa1fa64a699df97c8bbac6d0f8b5a
DIFF: 
https://github.com/llvm/llvm-project/commit/6e3e14f600afa1fa64a699df97c8bbac6d0f8b5a.diff

LOG: [AMDGPU] Support gfx940 smfmac instructions

Differential Revision: https://reviews.llvm.org/D122191

Added: 


Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/lib/Target/AMDGPU/SISchedule.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3870b1cca6caa..afcfa07f6df13 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -309,6 +309,12 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, 
"V4iWiWiV4iIiIiIi", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", 
"nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_f16, "V4fV4hV8hV4fiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_f16, 
"V16fV4hV8hV16fiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, 
"V16fV4sV8sV16fiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", 
"nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", 
"nc", "mai-insts")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index fc29faf9ad1c5..8e3cc7e382e90 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -10,13 +10,16 @@ typedef float  v4f   __attribute__((ext_vector_type(4)));
 typedef float  v16f  __attribute__((ext_vector_type(16)));
 typedef float  v32f  __attribute__((ext_vector_type(32)));
 typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef half   v16h  __attribute__((ext_vector_type(16)));
 typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef int    v4i   __attribute__((ext_vector_type(4)));
 typedef int    v16i  __attribute__((ext_vector_type(16)));
 typedef int    v32i  __attribute__((ext_vector_type(32)));
 typedef short  v2s   __attribute__((ext_vector_type(2)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
@@ -247,4 +250,46 @@ void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, 
v2f b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
 }
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x 
half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int 
idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
+// CHECK-GFX940: call <16 x float>

[compiler-rt] [libcxxabi] [libcxx] [lld] [flang] [llvm] [clang-tools-extra] [lldb] [clang] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Stanislav Mekhanoshin via cfe-commits



@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf (Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

rampitec wrote:

Prefetch on an absolute address is practically useless.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [libcxx] [lldb] [flang] [clang] [compiler-rt] [libcxxabi] [lld] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Stanislav Mekhanoshin via cfe-commits



@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf (Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

rampitec wrote:

So you want a target intrinsic?

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[compiler-rt] [libcxx] [lldb] [libcxxabi] [clang-tools-extra] [lld] [llvm] [clang] [flang] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-08 Thread Stanislav Mekhanoshin via cfe-commits



@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf (Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

rampitec wrote:

I do not think we need to use the PC_REL form to prefetch a function's address.
The instruction can take a full 64-bit address, so one can just use that address.
My understanding is that the PC_REL form can be useful if you expect something
like a huge loop or a local branch and want to prefetch something like 1K from
the PC. I am not sure, though, how useful this can be at a high language level
or even in IR.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lldb] [lld] [llvm] [libc] [libcxx] [flang] [compiler-rt] [clang-tools-extra] [clang] [GlobalISel] Add G_PREFETCH (PR #74863)

2023-12-11 Thread Stanislav Mekhanoshin via cfe-commits

@@ -1209,6 +1209,15 @@ def G_FENCE : GenericInstruction {
   let hasSideEffects = true;
 }

+// Generic opcode equivalent to the llvm.prefetch intrinsic.
+def G_PREFETCH : GenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins ptype0:$address, i32imm:$rw, i32imm:$locality, 
i32imm:$cachetype);
+  let hasSideEffects = true;
+  let mayLoad = true;
+  let mayStore = true;

rampitec wrote:

> should probably just be hasSideEffects. mayLoad/mayStore imply it needs a 
> memory operand and is an ordered memory reference when it doesn't have one

I could argue this is not a memory operation at all, as it shall have no visible
effects other than access speed, although practically it has ordering. You
certainly do not want a prefetch to be moved past the loads which it was
supposed to prefetch. I.e., in my view the use of both mayLoad and mayStore is
justified, although we need to make sure it is not considered an aliased store
or load from the AA point of view.

https://github.com/llvm/llvm-project/pull/74863
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [clang] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-11 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/7] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS location they store to
must wait on this counter so that it only reads memory after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[llvm] [libc] [flang] [openmp] [clang] [mlir] [libcxx] [lldb] [clang-tools-extra] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] - Add clang builtins for tied WMMA intrinsics (PR #70669)

2023-11-09 Thread Stanislav Mekhanoshin via cfe-commits



@@ -292,13 +292,17 @@ 
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", 
"nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts")

rampitec wrote:

Need to add negative test for the last operand to always be a constant integer. 
We do it every time 'I' modifier is used.

https://github.com/llvm/llvm-project/pull/70669
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] - Add clang builtins for tied WMMA intrinsics (PR #70669)

2023-11-10 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/70669
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [mlir] [libcxx] [llvm] [libc] [flang] [openmp] [clang] [lldb] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-10 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Any tests?

https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] make v32i16/v32f16 legal (PR #70484)

2023-10-27 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/70484
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Select 64-bit imm moves if can be encoded as 32 bit operand (PR #70395)

2023-10-30 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/70395
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [AMDGPU] Select 64-bit imm moves if can be encoded as 32 bit operand (PR #70395)

2023-10-30 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/70395
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] - Add clang builtins for tied WMMA intrinsics (PR #70669)

2023-10-31 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec commented:

Negative tests are also needed: one checking that the gfx11-insts feature is 
required (using a gfx1030 target, for example) and one for the immediate 
arguments. See for example builtins-amdgcn-gfx11-err.cl and 
builtins-amdgcn-fp-atomics-gfx11-err.cl.

https://github.com/llvm/llvm-project/pull/70669
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [clang] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)

2024-01-11 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/77772
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [flang] [clang] [clang-tools-extra] [compiler-rt] [libc] [lldb] [lld] [libcxx] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-11 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Ping

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[flang] [libcxx] [compiler-rt] [llvm] [libc] [lldb] [lld] [clang-tools-extra] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-12 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 01/10] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots

[flang] [libcxx] [compiler-rt] [llvm] [libc] [lldb] [lld] [clang-tools-extra] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-12 Thread Stanislav Mekhanoshin via cfe-commits



@@ -130,6 +130,8 @@
 ; GCN-O0-NEXT:MachineDominator Tree Construction
 ; GCN-O0-NEXT:Machine Natural Loop Construction
 ; GCN-O0-NEXT:MachinePostDominator Tree Construction
+; GCN-O0-NEXT:Basic Alias Analysis (stateless AA impl)
+; GCN-O0-NEXT:Function Alias Analysis Results

rampitec wrote:

If I just skip getAnalysis call it does not help since analysis is requested in 
the getAnalysisUsage. If I do not request it it is not available at any 
optlevel.  This is the benefit of the alternative 
https://github.com/llvm/llvm-project/pull/75974, it does not request the full 
analysis.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[flang] [libcxx] [compiler-rt] [llvm] [libc] [lldb] [lld] [clang-tools-extra] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-12 Thread Stanislav Mekhanoshin via cfe-commits



@@ -1183,9 +1228,21 @@ bool 
SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 // No need to wait before load from VMEM to LDS.
 if (TII->mayWriteLDSThroughDMA(MI))
   continue;
-unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+
 // VM_CNT is only relevant to vgpr or LDS.
-ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+bool FoundAliasingStore = false;
+if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {

rampitec wrote:

I have reserved just 8 pseudo registers to track it. I do not want to fill it 
with unrelated stuff. I know that the only way AA will be able to handle this 
very specific situation is if there is scope info, otherwise there is no reason 
to waste a slot and compile time. If I do not enter this 'if' the pass will 
just do conservatively correct thing and wait for this memory regardless of 
aliasing or lack of it.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [flang] [libc] [lldb] [compiler-rt] [lld] [llvm] [libcxx] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-12 Thread Stanislav Mekhanoshin via cfe-commits



@@ -703,8 +713,37 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 setRegScore(RegNo, T, CurrScore);
   }
 }
-if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+if (Inst.mayStore() &&
+(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+  // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+  // written can be accessed. A load from LDS to VMEM does not need a wait.
+  unsigned Slot = 0;
+  for (const auto *MemOp : Inst.memoperands()) {
+if (!MemOp->isStore() ||
+MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+  continue;
+// Comparing just AA info does not guarantee memoperands are equal

rampitec wrote:

Right, there is no PSV. I have mentioned PSV because you have earlier suggested 
to use it. For the real IR value: it is not helpful to compare it. The IR value 
is a GEP, and this GEP is always different. I.e. these values never compare 
equal. The rest of the IR is already gone and unavailable for the analysis. 
Even if it would be available this GEP will address kernel module LDS variable, 
a single huge LDS array, and will be useless again. In this case it will tell 
you any LDS operation aliases any other. Now during the module LDS lowering I 
am creating alias scope info specifically to disambiguate aliasing after the 
pass has squashed all LDS variables.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

2023-10-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> Hi @rampitec
> 
> With UBSan built binaries the MC/AMDGPU/literals.s testcase fails and 
> triggers UB like
> 
> ```
> 07:33:04 ../lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp:2246:59: runtime 
> error: left shift of negative value -54321
> 07:33:04 SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior 
> ../lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp:2246:59 
> ```
> 
> It's this new shift that it complains on:
> 
> ```
> Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? Val << 32 : Lo_32(Val);
> ```

I am happy to fix it, but I do not understand how a LEFT shift can be a UB? Low 
bits will be zero, right? High bits will be disposed. I really do not 
understand.

https://github.com/llvm/llvm-project/pull/68740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

2023-10-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> > Hi @rampitec
> > With UBSan built binaries the MC/AMDGPU/literals.s testcase fails and 
> > triggers UB like
> > ```
> > 07:33:04 ../lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp:2246:59: 
> > runtime error: left shift of negative value -54321
> > 07:33:04 SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior 
> > ../lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp:2246:59 
> > ```
> > 
> > 
> > It's this new shift that it complains on:
> > ```
> > Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? Val << 32 : 
> > Lo_32(Val);
> > ```
> 
> I am happy to fix it, but I do not understand how a LEFT shift can be a UB? 
> Low bits will be zero, right? High bits will be disposed. I really do not 
> understand.

Building ubsan now...

https://github.com/llvm/llvm-project/pull/68740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

2023-10-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

I have a small little problem that I cannot build tip now:
```
FAILED: tools/llvm-remarkutil/CMakeFiles/llvm-remarkutil.dir/RemarkCounter.cpp.o
CCACHE_CPP2=yes CCACHE_HASHDIR=yes /usr/bin/ccache /usr/bin/clang++ 
-DGTEST_HAS_RTTI=0 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE 
-D_LIBCPP_ENABLE_HARDENED_MODE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS 
-D__STDC_LIMIT_MACROS -I/home/stas/work/llvm/ubsan/tools/llvm-remarkutil 
-I/home/stas/work/llvm/llvm/tools/llvm-remarkutil 
-I/home/stas/work/llvm/ubsan/include -I/home/stas/work/llvm/llvm/include -fPIC 
-fvisibility-inlines-hidden -Werror=date-time 
-Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter 
-Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic 
-Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough 
-Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor 
-Wdelete-non-virtual-dtor -Wstring-conversion -Wmisleading-indentation 
-Wctad-maybe-unsupported -fdiagnostics-color -g  -fno-exceptions 
-funwind-tables -fno-rtti -std=c++17 -MD -MT 
tools/llvm-remarkutil/CMakeFiles/llvm-remarkutil.dir/RemarkCounter.cpp.o -MF 
tools/llvm-remarkutil/CMakeFiles/llvm-remarkutil.dir/RemarkCounter.cpp.o.d -o 
tools/llvm-remarkutil/CMakeFiles/llvm-remarkutil.dir/RemarkCounter.cpp.o -c 
/home/stas/work/llvm/llvm/tools/llvm-remarkutil/RemarkCounter.cpp
In file included from 
/home/stas/work/llvm/llvm/tools/llvm-remarkutil/RemarkCounter.cpp:13:
/home/stas/work/llvm/llvm/tools/llvm-remarkutil/RemarkCounter.h:90:14: error: 
call to deleted constructor of 'llvm::Error'
  return E;
 ^
```

https://github.com/llvm/llvm-project/pull/68740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

2023-10-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> I suppose left shift of negative values is undefined because if you shift out 
> the sign bit you can overflow and get a positive value.

Sounds like BS. It is defined. Unexpected maybe.

https://github.com/llvm/llvm-project/pull/68740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

2023-10-13 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> I suppose left shift of negative values is undefined because if you shift out 
> the sign bit you can overflow and get a positive value.

https://github.com/llvm/llvm-project/pull/68959

https://github.com/llvm/llvm-project/pull/68740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable (PR #69483)

2023-10-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/69483

>From 27ab57359ea876c0ce78e42d8ab1ffc47348efb1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Wed, 18 Oct 2023 09:50:44 -0700
Subject: [PATCH] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable

With the legality checks in place it is now safe to do. S_MOV_B64
shall not be used with wide literals, thus updating the test.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  1 +
 .../AMDGPU/fold-short-64-bit-literals.mir | 23 +--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d6733bfa058acee..4ff7b462f0f3295 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3170,6 +3170,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   case AMDGPU::V_MOV_B64_e64:
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
   case AMDGPU::COPY:
   case AMDGPU::WWM_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir 
b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
index 328ee991da8f4a6..6e975c8a5370758 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
@@ -9,11 +9,11 @@ body: |
 
 ; GCN-LABEL: name: no_fold_fp_64bit_literal_sgpr
 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
[[S_MOV_B64_]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 
1311768467750121200
+; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
[[S_MOV_B]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]]
 %0:vreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 1311768467750121200
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
 %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit 
$exec
 SI_RETURN_TO_EPILOG %2
 ...
@@ -46,7 +46,7 @@ body: |
 ; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
4636737291354636288, 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]]
 %0:vreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 4636737291354636288
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 4636737291354636288
 %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit 
$exec
 SI_RETURN_TO_EPILOG %2
 ...
@@ -59,11 +59,11 @@ body: |
 
 ; GCN-LABEL: name: no_fold_int_64bit_literal_sgpr
 ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 
[[S_MOV_B64_]], implicit-def $scc
+; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 
1311768467750121200
+; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 
[[S_MOV_B]], implicit-def $scc
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
 %0:sreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 1311768467750121200
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
 %2:sreg_64 = S_AND_B64 %0, %1, implicit-def $scc
 SI_RETURN_TO_EPILOG %2
 ...
@@ -106,6 +106,11 @@ tracksRegLiveness: true
 body: |
   bb.0:
 
+; GCN-LABEL: name: no_fold_v2fp_64bit_literal_sgpr
+; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 
4629700418019000320, implicit $exec
+; GCN-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64 = V_PK_ADD_F32 0, [[DEF]], 
0, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_PK_ADD_F32_]]
 %0:vreg_64 = IMPLICIT_DEF
 %1:vreg_64 = V_MOV_B64_PSEUDO 4629700418019000320, implicit $exec
 %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec
@@ -118,6 +123,10 @@ tracksRegLiveness: true
 body: |
   bb.0:
 
+; GCN-LABEL: name: fold_v2fp_32bit_literal_sgpr
+; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+; GCN-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64 = V_PK_ADD_F32 0, [[DEF]], 
0, 1065353216, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_PK_ADD_F32_]]
 %0:vreg_64 = IMPLICIT_DEF
 %1:vreg_64 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
 %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable (PR #69483)

2023-10-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/69483

>From 27ab57359ea876c0ce78e42d8ab1ffc47348efb1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Wed, 18 Oct 2023 09:50:44 -0700
Subject: [PATCH] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable

With the legality checks in place it is now safe to do. S_MOV_B64
shall not be used with wide literals, thus updating the test.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  1 +
 .../AMDGPU/fold-short-64-bit-literals.mir | 23 +--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d6733bfa058acee..4ff7b462f0f3295 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3170,6 +3170,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   case AMDGPU::V_MOV_B64_e64:
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
   case AMDGPU::COPY:
   case AMDGPU::WWM_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir 
b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
index 328ee991da8f4a6..6e975c8a5370758 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
@@ -9,11 +9,11 @@ body: |
 
 ; GCN-LABEL: name: no_fold_fp_64bit_literal_sgpr
 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
[[S_MOV_B64_]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 
1311768467750121200
+; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
[[S_MOV_B]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]]
 %0:vreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 1311768467750121200
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
 %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit 
$exec
 SI_RETURN_TO_EPILOG %2
 ...
@@ -46,7 +46,7 @@ body: |
 ; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 
4636737291354636288, 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]]
 %0:vreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 4636737291354636288
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 4636737291354636288
 %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit 
$exec
 SI_RETURN_TO_EPILOG %2
 ...
@@ -59,11 +59,11 @@ body: |
 
 ; GCN-LABEL: name: no_fold_int_64bit_literal_sgpr
 ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 
[[S_MOV_B64_]], implicit-def $scc
+; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 
1311768467750121200
+; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 
[[S_MOV_B]], implicit-def $scc
 ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
 %0:sreg_64 = IMPLICIT_DEF
-%1:sreg_64 = S_MOV_B64 1311768467750121200
+%1:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
 %2:sreg_64 = S_AND_B64 %0, %1, implicit-def $scc
 SI_RETURN_TO_EPILOG %2
 ...
@@ -106,6 +106,11 @@ tracksRegLiveness: true
 body: |
   bb.0:
 
+; GCN-LABEL: name: no_fold_v2fp_64bit_literal_sgpr
+; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 
4629700418019000320, implicit $exec
+; GCN-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64 = V_PK_ADD_F32 0, [[DEF]], 
0, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_PK_ADD_F32_]]
 %0:vreg_64 = IMPLICIT_DEF
 %1:vreg_64 = V_MOV_B64_PSEUDO 4629700418019000320, implicit $exec
 %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec
@@ -118,6 +123,10 @@ tracksRegLiveness: true
 body: |
   bb.0:
 
+; GCN-LABEL: name: fold_v2fp_32bit_literal_sgpr
+; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+; GCN-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64 = V_PK_ADD_F32 0, [[DEF]], 
0, 1065353216, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_PK_ADD_F32_]]
 %0:vreg_64 = IMPLICIT_DEF
 %1:vreg_64 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
 %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable (PR #69483)

2023-10-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/69483
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [AMDGPU] Make S_MOV_B64_IMM_PSEUDO foldable (PR #69483)

2023-10-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/69483
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [clang-tools-extra] [AMDGPU] Fix folding of v2i16/v2f16 splat imms (PR #72709)

2023-11-28 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/72709

>From 423a0d1d4640680c5db3382ca0652fe85051ad8d Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 17 Nov 2023 10:52:13 -0800
Subject: [PATCH] [AMDGPU] Fix folding of v2i16/v2f16 splat imms

We can use inline constants with packed 16-bit operands, but these
should use op_sel. Currently splat of inlinable constants is
considered legal, which is not really true if we fail to fold it
with op_sel and drop the high half. It may be legal as a literal
but not as inline constant, but then usual literal checks must
be performed.

This patch makes these splat literals illegal but adds additional
logic to the operand folding to keep current folds. This logic
is somewhat heavy though.

This has fixed two bugs: constant bus violation in the fdot2 test
and invalid selection of inline constant 1 without op_sel in the
udot2 test.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 135 +++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  15 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp|  10 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   3 +
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll |  29 ++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll |   4 +-
 6 files changed, 128 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 
b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ec0370e21dfc16..709de612d81d4a1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ class SIFoldOperands : public MachineFunctionPass {
 
   bool updateOperand(FoldCandidate &Fold) const;
 
+  bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
   bool tryAddToFoldList(SmallVectorImpl &FoldList,
 MachineInstr *MI, unsigned OpNo,
 MachineOperand *OpToFold) const;
@@ -196,60 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
-  assert(Old.isReg());
+  const uint64_t TSFlags = MI->getDesc().TSFlags;
 
+  assert(Old.isReg() && Fold.isImm());
 
-  const uint64_t TSFlags = MI->getDesc().TSFlags;
-  if (Fold.isImm()) {
-if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
-(!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
-AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
-  ST->hasInv2PiInlineImm())) {
-  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
-  // already set.
-  unsigned Opcode = MI->getOpcode();
-  int OpNo = MI->getOperandNo(&Old);
-  int ModIdx = -1;
-  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
-ModIdx = AMDGPU::OpName::src0_modifiers;
-  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, 
AMDGPU::OpName::src1))
-ModIdx = AMDGPU::OpName::src1_modifiers;
-  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, 
AMDGPU::OpName::src2))
-ModIdx = AMDGPU::OpName::src2_modifiers;
-  assert(ModIdx != -1);
-  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
-  MachineOperand &Mod = MI->getOperand(ModIdx);
-  unsigned Val = Mod.getImm();
-  if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
-// Only apply the following transformation if that operand requires
-// a packed immediate.
-switch (TII->get(Opcode).operands()[OpNo].OperandType) {
-case AMDGPU::OPERAND_REG_IMM_V2FP16:
-case AMDGPU::OPERAND_REG_IMM_V2INT16:
-case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-  // If upper part is all zero we do not need op_sel_hi.
-  if (!isUInt<16>(Fold.ImmToFold)) {
-if (!(Fold.ImmToFold & 0x)) {
-  Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
-  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-  Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0x);
-  return true;
-}
-Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-Old.ChangeToImmediate(Fold.ImmToFold & 0x);
-return true;
-  }
-  break;
-default:
-  break;
-}
-  }
-}
+  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+  (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+  isUInt<16>(Fold.ImmToFold) ||
+  !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+return false;
+
+  unsigned Opcode = MI->getOpcode();
+  int OpNo = MI-

[clang] [llvm] [clang-tools-extra] [AMDGPU] Fix folding of v2i16/v2f16 splat imms (PR #72709)

2023-11-28 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec edited 
https://github.com/llvm/llvm-project/pull/72709
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [clang] [clang-tools-extra] [AMDGPU] Fix folding of v2i16/v2f16 splat imms (PR #72709)

2023-11-28 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/72709
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [clang-tools-extra] [AMDGPU] Fix folding of v2i16/v2f16 splat imms (PR #72709)

2023-11-28 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

After some digging, I believe that with this bug fixed we are fine now. Since we
are passing all bf16 inputs as i16, we can only inline small integers, and inline
integer 1 should be the same as using 1 in an input register. However, we are
missing a potential optimization: for example, we could fold 'i16 0x3f80' as
inline constant 1.0, and a pair of these as 1.0 with op_sel, if we knew this
is really a bf16 operand.

https://github.com/llvm/llvm-project/pull/72709
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libcxx] [flang] [clang] [compiler-rt] [lld] [clang-tools-extra] [llvm] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/7] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[libcxx] [flang] [clang] [compiler-rt] [lld] [clang-tools-extra] [llvm] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Ping

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[flang] [llvm] [libcxx] [compiler-rt] [lld] [clang-tools-extra] [libc] [clang] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/7] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[lld] [compiler-rt] [clang] [flang] [lldb] [libc] [libcxx] [clang-tools-extra] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

To make it easier I am splitting the patch. I have pre-committed the test, and
there is a part which fixes lack of wait on GFX10:
https://github.com/llvm/llvm-project/pull/75245

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lld] [compiler-rt] [clang] [flang] [lldb] [libc] [libcxx] [clang-tools-extra] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Another part is improving memoperand info: 
https://github.com/llvm/llvm-project/pull/75247. This is NFCI just by itself.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lld] [compiler-rt] [clang] [flang] [lldb] [libc] [libcxx] [clang-tools-extra] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-12 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Yet another part to fix disjoint memory checks with LDS DMA: 
https://github.com/llvm/llvm-project/pull/75249

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [lld] [clang-tools-extra] [compiler-rt] [lldb] [flang] [llvm] [libcxx] [libc] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/7] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[clang] [clang-tools-extra] [compiler-rt] [flang] [llvm] [libcxx] [libc] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/75249

>From 82606c4447e8aa8edde90ed420f1c48707967695 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 12 Dec 2023 13:45:47 -0800
Subject: [PATCH] [AMDGPU] Fix lack of LDS DMA check in the AA handling

SIInstrInfo::areMemAccessesTriviallyDisjoint does DS offset
checks, but does not account for LDS DMA instructions. Added
these checks. Without them the code falls through and returns true
which is wrong. As a result mayAlias would always return false
for LDS DMA and a regular LDS instruction or 2 LDS DMA instructions.

At the moment this is NFCI because we do not use this AA in a
context which may touch LDS DMA instructions. This is also
unreachable now because of the ordered memory ref checks just
above in the function and LDS DMA is marked as volatile. This
volatile marking is removed in PR #75247, therefore I'd submit
this check before #75247.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h   | 8 
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4e4526795f3b3..c485eb299d52a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3656,8 +3656,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+if (isDS(MIb) || isLDSDMA(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e794d8cf7cc220..97800bda775cda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -546,6 +546,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
+  static bool isLDSDMA(const MachineInstr &MI) {
+return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+  }
+
+  bool isLDSDMA(uint16_t Opcode) {
+return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  }
+
   static bool isGWS(const MachineInstr &MI) {
 return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [libc] [flang] [compiler-rt] [libcxx] [clang] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec edited 
https://github.com/llvm/llvm-project/pull/75249
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[flang] [clang] [clang-tools-extra] [compiler-rt] [libcxx] [llvm] [libc] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits



@@ -3656,8 +3656,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+if (isDS(MIb) || isLDSDMA(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);

rampitec wrote:

It does, even though it just bails. It goes down to 
getMemOperandsWithOffsetWidth and there it bails on the LDS DMA:
```
 449│ // Get appropriate operand, and compute width accordingly.
 450│ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
 451│ if (DataOpIdx == -1)
 452│   DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
 453│ if (DataOpIdx == -1) // LDS DMA
 454│   return false;
```
In principle these offsets are analyzable. This is a typical store memop:
```
(dereferenceable store (s32) into `ptr addrspace(3) getelementptr inbounds 
(%llvm.amdgcn.kernel.buffer_load_lds_dword_2_ar
rays.lds.t, ptr addrspace(3) 
@llvm.amdgcn.kernel.buffer_load_lds_dword_2_arrays.lds, i32 0, i32 1)
```
But if you want I can bail right here.

https://github.com/llvm/llvm-project/pull/75249
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [libc] [flang] [compiler-rt] [libcxx] [clang] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/75249

>From 82606c4447e8aa8edde90ed420f1c48707967695 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 12 Dec 2023 13:45:47 -0800
Subject: [PATCH 1/2] [AMDGPU] Fix lack of LDS DMA check in the AA handling

SIInstrInfo::areMemAccessesTriviallyDisjoint does DS offset
checks, but does not account for LDS DMA instructions. Added
these checks. Without them the code falls through and returns true
which is wrong. As a result mayAlias would always return false
for LDS DMA and a regular LDS instruction or 2 LDS DMA instructions.

At the moment this is NFCI because we do not use this AA in a
context which may touch LDS DMA instructions. This is also
unreachable now because of the ordered memory ref checks just
above in the function and LDS DMA is marked as volatile. This
volatile marking is removed in PR #75247, therefore I'd submit
this check before #75247.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h   | 8 
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4e4526795f3b3..c485eb299d52a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3656,8 +3656,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+if (isDS(MIb) || isLDSDMA(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e794d8cf7cc220..97800bda775cda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -546,6 +546,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
+  static bool isLDSDMA(const MachineInstr &MI) {
+return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+  }
+
+  bool isLDSDMA(uint16_t Opcode) {
+return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  }
+
   static bool isGWS(const MachineInstr &MI) {
 return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }

>From d8d9f3aab2d2fff2911a99d096685e78faf3d917 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Wed, 13 Dec 2023 11:42:10 -0800
Subject: [PATCH 2/2] Bail early in areMemAccessesTriviallyDisjoint

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 57eaefd41b2622..31669764144530 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3651,6 +3651,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
 return false;
 
+  if (isLDSDMA(MIa) || isLDSDMA(MIb))
+return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libc] [flang] [clang-tools-extra] [libcxx] [compiler-rt] [lld] [lldb] [clang] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/8] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[clang-tools-extra] [lldb] [llvm] [libc] [flang] [lld] [compiler-rt] [libcxx] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/9] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS they store to must
wait on this counter so that it reads memory only after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[clang-tools-extra] [llvm] [libc] [flang] [compiler-rt] [libcxx] [clang] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/75249

>From 82606c4447e8aa8edde90ed420f1c48707967695 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 12 Dec 2023 13:45:47 -0800
Subject: [PATCH 1/3] [AMDGPU] Fix lack of LDS DMA check in the AA handling

SIInstrInfo::areMemAccessesTriviallyDisjoint does DS offset
checks, but does not account for LDS DMA instructions. Added
these checks. Without them the code falls through and returns true,
which is wrong. As a result mayAlias would always return false
for LDS DMA and a regular LDS instruction or 2 LDS DMA instructions.

At the moment this is NFCI because we do not use this AA in a
context which may touch LDS DMA instructions. This is also
unreachable now because of the ordered memory ref checks just
above in the function and because LDS DMA is marked as volatile. This
volatile marking is removed in PR #75247, therefore I'd submit
this check before #75247.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h   | 8 
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4e4526795f3b3..c485eb299d52a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3656,8 +3656,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+if (isDS(MIb) || isLDSDMA(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e794d8cf7cc220..97800bda775cda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -546,6 +546,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
+  static bool isLDSDMA(const MachineInstr &MI) {
+return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+  }
+
+  bool isLDSDMA(uint16_t Opcode) {
+return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  }
+
   static bool isGWS(const MachineInstr &MI) {
 return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }

>From d8d9f3aab2d2fff2911a99d096685e78faf3d917 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Wed, 13 Dec 2023 11:42:10 -0800
Subject: [PATCH 2/3] Bail early in areMemAccessesTriviallyDisjoint

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 57eaefd41b2622..31669764144530 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3651,6 +3651,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
 return false;
 
+  if (isLDSDMA(MIa) || isLDSDMA(MIb))
+return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,

>From 609be418b81f6ce8c9b323f60636af01f862a994 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Wed, 13 Dec 2023 11:45:50 -0800
Subject: [PATCH 3/3] Remove old code

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 31669764144530..d05d3c6996261f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3659,8 +3659,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa) || isLDSDMA(MIa)) {
-if (isDS(MIb) || isLDSDMA(MIb))
+  if (isDS(MIa)) {
+if (isDS(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [libc] [flang] [compiler-rt] [libcxx] [clang] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-13 Thread Stanislav Mekhanoshin via cfe-commits



@@ -3656,8 +3656,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const 
MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+if (isDS(MIb) || isLDSDMA(MIb))
   return checkInstOffsetsDoNotOverlap(MIa, MIb);

rampitec wrote:

Just bail early.

https://github.com/llvm/llvm-project/pull/75249
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libc] [compiler-rt] [clang-tools-extra] [clang] [llvm] [flang] [libcxx] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-14 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Ping. This one seems obvious to me.

https://github.com/llvm/llvm-project/pull/75249
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libc] [llvm] [libcxx] [clang-tools-extra] [clang] [compiler-rt] [flang] [AMDGPU] Fix lack of LDS DMA check in the AA handling (PR #75249)

2023-12-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/75249
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[compiler-rt] [clang-tools-extra] [libcxx] [libc] [clang] [llvm] [flang] [AMDGPU] Produce better memoperand for LDS DMA (PR #75247)

2023-12-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec closed 
https://github.com/llvm/llvm-project/pull/75247
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [libcxx] [lldb] [clang-tools-extra] [libc] [compiler-rt] [flang] [lld] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-18 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/9] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS they store to must
wait on this counter so that it reads memory only after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots f

[clang] [libcxx] [compiler-rt] [lldb] [libc] [llvm] [lld] [flang] [clang-tools-extra] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-18 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

All split off parts were merged and this patch is merged with main. Only 
waitcount insertion pass changes remained here.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[compiler-rt] [libcxx] [flang] [libc] [lldb] [lld] [clang] [clang-tools-extra] [llvm] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> How does this work in a case like this?
> 
> ```
> call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
> @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
> call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
> %ptr, i32 4, i32 0, i32 0, i32 0, i32 0)
> %val.3 = load float, ptr addrspace(3) @lds.3, align 4
> ```
> 
> i.e.
> 
> * store to known lds address `@lds.3` (this will use slot 0 and another 
> slot e.g. slot 3?)
> 
> * store to unknown lds address (this will use slot 0?)
> 
> * load from known lds address `@lds.3` (this will use slot 3?)

It does not know the pointer, so it uses default slot 0 and waits till 0. I 
have to tell anyone interested here: before I even wrote this code it didn't 
know of the dependency and did not wait for anything at all. Everyone was happy.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [clang-tools-extra] [compiler-rt] [llvm] [libcxx] [lldb] [lld] [libc] [flang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> Test case:
> 
> ```
> @lds.0 = internal addrspace(3) global [64 x float] poison, align 16
> @lds.1 = internal addrspace(3) global [64 x float] poison, align 16
> 
> declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr 
> addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, 
> i32 %aux)
> 
> define amdgpu_kernel void @f(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr 
> addrspace(1) %out, ptr addrspace(3) %ptr) {
> main_body:
>   call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr 
> addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
>   call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr 
> addrspace(3) %ptr, i32 4, i32 0, i32 0, i32 0, i32 0)
>   %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
>   %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
>   %val.0 = load volatile float, ptr addrspace(3) %gep.0, align 4
>   %val.1 = load volatile float, ptr addrspace(3) %gep.1, align 4
>   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
>   store float %val.0, ptr addrspace(1) %out
>   store float %val.1, ptr addrspace(1) %out.gep.1
>   ret void
> }
> ```
> 
> Generates:
> 
> ```
>   s_load_dwordx8 s[4:11], s[0:1], 0x24
>   s_load_dword s2, s[0:1], 0x44
>   s_mov_b32 m0, 0
>   v_mov_b32_e32 v2, 0
>   s_waitcnt lgkmcnt(0)
>   buffer_load_dword off, s[4:7], 0 lds
>   s_mov_b32 m0, s2
>   s_lshl_b32 s0, s8, 2
>   buffer_load_dword off, s[4:7], 0 lds
>   s_lshl_b32 s1, s9, 2
>   v_mov_b32_e32 v0, s0
>   v_mov_b32_e32 v1, s1
>   s_waitcnt vmcnt(1)
>   ds_read_b32 v0, v0
>   s_waitcnt vmcnt(0)
>   ds_read_b32 v1, v1 offset:256
>   s_waitcnt lgkmcnt(0)
>   global_store_dwordx2 v2, v[0:1], s[10:11]
>   s_endpgm
> ```
> 
> The `s_waitcnt vmcnt(1)` seems incorrect, because the second 
> buffer-load-to-lds might clobber `@lds.0`.

This is still correct, pointer argument cannot alias module global. A pointer 
argument to a kernel is an LDS external requested by the host side, and host 
cannot see module LDS.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [clang-tools-extra] [compiler-rt] [llvm] [libcxx] [lldb] [lld] [libc] [flang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits

rampitec wrote:

> This is still correct, pointer argument cannot alias module global. A pointer 
> argument to a kernel is an LDS external requested by the host side, and host 
> cannot see module LDS.

I.e. that is really the point of the patch: if we are able to definitively 
identify an LDS object targeted by both load and store we only wait on that 
store or stores. And the only way to definitively identify the object at this 
stage is via alias.scope info which we are generating ourselves during module 
LDS lowering.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [libc] [lldb] [lld] [llvm] [compiler-rt] [libcxx] [flang] [clang-tools-extra] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 01/10] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS they store to must
wait on this counter so that it reads memory only after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots

[compiler-rt] [llvm] [libc] [libcxx] [lldb] [clang] [lld] [clang-tools-extra] [flang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

> > This is still correct, pointer argument cannot alias module global. A 
> > pointer argument to a kernel is an LDS external requested by the host side, 
> > and host cannot see module LDS.
> 
> I.e. that is really the point of the patch: if we are able to definitively 
> identify an LDS object targeted by both load and store we only wait on that 
> store or stores. And the only way to definitively identify the object at this 
> stage is via alias.scope info which we are generating ourselves during module 
> LDS lowering.

I have added a check for the presence of alias scope info just in case we get a 
rogue AA. The testcase with a pointer argument still produces correct code with 
vmcnt(1).

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lld] [compiler-rt] [flang] [libc] [libcxx] [llvm] [clang] [lldb] [clang-tools-extra] [AMDGPU] Use alias scope to relax waitcounts for LDS DMA (PR #75974)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec created 
https://github.com/llvm/llvm-project/pull/75974

LDS DMA loads increase VMCNT, and a load from the LDS they store to must
wait on this counter so that it reads memory only after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a pseudo
register is used in the scoreboard, acting as if LDS DMA writes
it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias scope info.

Fixes: SWDEV-433427

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 01/11] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS they store to must
wait on this counter so that it reads memory only after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/Mac

[lld] [compiler-rt] [flang] [libc] [libcxx] [llvm] [clang] [lldb] [clang-tools-extra] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Actually, since I am only using alias scope, I can avoid alias analysis 
altogether and only compare alias scopes. This does not need an analysis pass 
or calls to mayAlias, and is in general simpler code. You can see an alternative 
PR if you like it more: https://github.com/llvm/llvm-project/pull/75974

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lld] [compiler-rt] [flang] [libc] [libcxx] [llvm] [clang] [lldb] [clang-tools-extra] [AMDGPU] Use alias scope to relax waitcounts for LDS DMA (PR #75974)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/75974

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 01/12] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS location that was
stored must wait on this counter so that it only reads memory after it is
written. The wait count insertion pass does not track memory dependencies;
it tracks register dependencies. To model the LDS dependency a
pseudo register is used in the scoreboard, acting as if the LDS DMA
writes it and the LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h|   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll | 154 
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll|   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 MachineMemOperand::MOStore |
 MachineMemOperand::MODereferenceable;
 
-  // XXX - Should this be volatile without known ordering?
-  Info.flags |= MachineMemOperand::MOVolatile;
-
   switch (IntrID) {
   default:
+// XXX - Should this be volatile without known ordering?
+Info.flags |= MachineMemOperand::MOVolatile;
 break;
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
 unsigned Width = 
cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+Info.ptrVal = CI.getArgOperand(1);
 return true;
   }
   }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 Info.opc = ISD::INTRINSIC_VOID;
 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-  MachineMemOperand::MOVolatile;
+Info.ptrVal = CI.getArgOperand(1);
+Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
 MachinePointerInfo StorePtrI = LoadPtrI;
-StorePtrI.V = nullptr;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
 auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 LoadPtrI.Offset = Op->getConstantOperandVal(5);
 MachinePointerInfo StorePtrI = LoadPtrI;
+LoadPtrI.V = UndefValue::get(
+PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,  // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,// A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,// Reserved slots

[clang] [lldb] [flang] [llvm] [libc] [libcxx] [lld] [clang-tools-extra] [compiler-rt] [AMDGPU] Use alias scope to relax waitcounts for LDS DMA (PR #75974)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

One thing to note: this alias.scope I am creating myself in the module LDS 
lowering, so I do exactly know what to expect. And then since there is this 
module LDS lowering even if any alias scope would be created before (which 
never happens, much less for an intrinsic call) it is already lost. It is lost 
along with the memory objects deleted by the lowering. That is the whole point 
of creating alias.scope metadata during the lowering: we are putting all module 
LDS into a single structure, so no AA will ever disambiguate it w/o alias scope 
info. In this situation I am the sole creator of the metadata, instructions 
carrying it, memory object accessed, and the consumer of this metadata.

At -O0 there will be no LDS lowering, but there will be no AA either. I do not 
see how to exploit it in practice.

One other thing to note here: there is also !noalias metadata generated in the 
very same place. I do not care about this because I am really searching for a 
store into this memory, which is a scope.

When I was writing code to generate this metadata I kept in mind exactly a 
scenario similar to this.

https://github.com/llvm/llvm-project/pull/75974
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [lldb] [lld] [flang] [clang-tools-extra] [libcxx] [llvm] [libc] [compiler-rt] [AMDGPU] Use alias scope to relax waitcounts for LDS DMA (PR #75974)

2023-12-19 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

This is the place I am creating it: https://reviews.llvm.org/D108315

https://github.com/llvm/llvm-project/pull/75974
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[lld] [clang] [flang] [clang-tools-extra] [llvm] [lldb] [libc] [compiler-rt] [libcxx] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-02 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Ping

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libcxx] [flang] [libc] [clang-tools-extra] [lldb] [lld] [compiler-rt] [clang] [llvm] [AMDGPU] Use alias scope to relax waitcounts for LDS DMA (PR #75974)

2024-01-02 Thread Stanislav Mekhanoshin via cfe-commits


rampitec wrote:

Ping

https://github.com/llvm/llvm-project/pull/75974
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [clang-tools-extra] [libcxx] [compiler-rt] [lld] [clang] [libc] [flang] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-03 Thread Stanislav Mekhanoshin via cfe-commits



@@ -703,8 +713,37 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 setRegScore(RegNo, T, CurrScore);
   }
 }
-if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+if (Inst.mayStore() &&
+(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+  // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+  // written can be accessed. A load from LDS to VMEM does not need a wait.
+  unsigned Slot = 0;
+  for (const auto *MemOp : Inst.memoperands()) {
+if (!MemOp->isStore() ||
+MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+  continue;
+// Comparing just AA info does not guarantee memoperands are equal

rampitec wrote:

> PseudoSourceValue::mayAlias is supposed to report aliasing to possible IR 
> values. It looks like it's layered weirdly, and expects you to go through 
> MachineInstr::mayAlias. MachineInstr::mayAlias ought to be using the AA tags, 
> it shouldn't be a fundamental limitation

This is all PSV::mayAlias() does:
```
bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
  return !(isGOT() || isConstantPool() || isJumpTable());
}
```
Not very useful. And even to get to the AA tags check, MachineInstr::mayAlias() 
has to go through all the IR value checks first.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [clang-tools-extra] [libcxx] [compiler-rt] [lld] [clang] [libc] [flang] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-03 Thread Stanislav Mekhanoshin via cfe-commits



@@ -703,8 +713,37 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 setRegScore(RegNo, T, CurrScore);
   }
 }
-if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+if (Inst.mayStore() &&
+(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+  // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+  // written can be accessed. A load from LDS to VMEM does not need a wait.
+  unsigned Slot = 0;
+  for (const auto *MemOp : Inst.memoperands()) {
+if (!MemOp->isStore() ||
+MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+  continue;
+// Comparing just AA info does not guarantee memoperands are equal

rampitec wrote:

> It looks to me like it does use it if you pass UseTBAA=true. Not sure why 
> this would be a parameter in the first place

I am passing it, but to get to that check it must first go through all the Value 
and offset checks. Using AA is the last thing it does: 
https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[llvm] [clang-tools-extra] [libcxx] [compiler-rt] [lld] [clang] [libc] [flang] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2024-01-03 Thread Stanislav Mekhanoshin via cfe-commits



@@ -703,8 +713,37 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 setRegScore(RegNo, T, CurrScore);
   }
 }
-if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+if (Inst.mayStore() &&
+(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+  // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+  // written can be accessed. A load from LDS to VMEM does not need a wait.
+  unsigned Slot = 0;
+  for (const auto *MemOp : Inst.memoperands()) {
+if (!MemOp->isStore() ||
+MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+  continue;
+// Comparing just AA info does not guarantee memoperands are equal

rampitec wrote:

> The values don't need to be identical, that's the point of the AA query. 
> BasicAA will parse through the offsets

I also think that the values don't need to be identical. But that is what 
MachineInstr::mayAlias() checks *before* it checks AA: 
https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

r353588 - [AMDGPU] Split dot-insts feature

2019-02-08 Thread Stanislav Mekhanoshin via cfe-commits

Author: rampitec
Date: Fri Feb  8 16:34:41 2019
New Revision: 353588

URL: http://llvm.org/viewvc/llvm-project?rev=353588&view=rev
Log:
[AMDGPU] Split dot-insts feature

Differential Revision: https://reviews.llvm.org/D57972

Modified:
cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl

Modified: cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def?rev=353588&r1=353587&r2=353588&view=diff
==
--- cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def Fri Feb  8 16:34:41 2019
@@ -145,13 +145,13 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h,
 // Deep learning builtins.
 
//===--===//
 
-TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot2-insts")
 
 
//===--===//
 // Special builtins.

Modified: cfe/trunk/lib/Basic/Targets/AMDGPU.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AMDGPU.cpp?rev=353588&r1=353587&r2=353588&view=diff
==
--- cfe/trunk/lib/Basic/Targets/AMDGPU.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AMDGPU.cpp Fri Feb  8 16:34:41 2019
@@ -136,7 +136,8 @@ bool AMDGPUTargetInfo::initFeatureMap(
 switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) {
 case GK_GFX906:
   Features["dl-insts"] = true;
-  Features["dot-insts"] = true;
+  Features["dot1-insts"] = true;
+  Features["dot2-insts"] = true;
   LLVM_FALLTHROUGH;
 case GK_GFX909:
 case GK_GFX904:

Modified: cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl?rev=353588&r1=353587&r2=353588&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-features.cl Fri Feb  8 16:34:41 2019
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX601 %s
 
 // GFX904: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
-// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
+// GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts"
 // GFX801: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+s-memrealtime,+vi-insts"
 // GFX700: "target-features"="+ci-insts,+fp64-fp16-denormals,-fp32-denormals"
 // GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"

Modified: cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl?rev=353588&r1=353587&r2=353588&view=diff
==
--- cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl Fri Feb  8 
16:34:41 2019
@@ -12,24 +12,24 @@ kernel void builtins_amdgcn_dl_insts_err
 half2 v2hA, half2 v2hB, float fC,
 short2 v2ssA, short2 v2ssB, int siA, int siB, int siC,
 ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) {
-  fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // 
expected-error {{'__builtin_amdgcn_fdot2' needs target

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Stanislav Mekhanoshin via cfe-commits



@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
 ; GFX11:   ; %bb.0: ; %entry
 ; GFX11-NEXT:v_mov_b32_e32 v2, s1
 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2
+; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2

rampitec wrote:

Well, this is unrelated to the patch itself. We can use inline 1.0 here, but 
then we must use op_sel_hi to produce it in the high half.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Stanislav Mekhanoshin via cfe-commits



@@ -0,0 +1,8 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s

rampitec wrote:

You also need a disasm test for this.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Stanislav Mekhanoshin via cfe-commits



@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=GFX11,SDAG-GFX11
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11

rampitec wrote:

Replace 'RUN' with 'XUN' and add a comment instead.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Stanislav Mekhanoshin via cfe-commits



@@ -488,6 +488,49 @@ static bool printImmediateFloat16(uint32_t Imm, const 
MCSubtargetInfo &STI,
   return true;
 }
 
+static bool printImmediateBFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
+   raw_ostream &O) {
+  if (Imm == 0x3F80)
+O << "1.0";
+  else if (Imm == 0xBF80)
+O << "-1.0";
+  else if (Imm == 0x3F00)
+O << "0.5";
+  else if (Imm == 0xBF00)
+O << "-0.5";
+  else if (Imm == 0x4000)
+O << "2.0";
+  else if (Imm == 0xC000)
+O << "-2.0";
+  else if (Imm == 0x4080)
+O << "4.0";
+  else if (Imm == 0xC080)
+O << "-4.0";
+  else if (Imm == 0x3E22 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
+O << "0.15915494";
+  else
+return false;
+
+  return true;
+}
+
+void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm,
+   const MCSubtargetInfo &STI,
+   raw_ostream &O) {
+  int16_t SImm = static_cast<int16_t>(Imm);
+  if (isInlinableIntLiteral(SImm)) {
+O << SImm;
+return;
+  }
+
+  uint16_t HImm = static_cast<uint16_t>(Imm);
+  if (printImmediateBFloat16(HImm, STI, O))
+return;
+
+  uint64_t Imm16 = static_cast<uint16_t>(Imm);

rampitec wrote:

It's the same as HImm above.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

1 2 3 >

1 - 100 of 201 matches

Mail list logo