skc7 created this revision.
Herald added subscribers: jdoerfert, hiraditya.
Herald added a reviewer: aaron.ballman.
Herald added a project: All.
skc7 requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.

This change introduces shuffle as function attribute in clang and llvm IR. It 
is used to identify __shfl_sync like cross-lane APIs [allows exchange of 
variable across all active threads]. At clang codegen, noundef attribute is 
skipped to arguments and return types for functions with shuffle attribute.

Shuffle attribute has been added as per suggestions/comments from review: 
D124158 <https://reviews.llvm.org/D124158>


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D125378

Files:
  clang/include/clang/Basic/Attr.td
  clang/include/clang/Basic/AttrDocs.td
  clang/lib/CodeGen/CGCall.cpp
  clang/lib/Headers/__clang_cuda_intrinsics.h
  clang/lib/Sema/SemaDeclAttr.cpp
  clang/test/CodeGenHIP/shuffle-attr-verify.hip
  clang/test/CodeGenHIP/shuffle-noundef-attr.hip
  clang/test/Misc/pragma-attribute-supported-attributes-list.test
  llvm/include/llvm/Bitcode/LLVMBitCodes.h
  llvm/include/llvm/IR/Attributes.td
  llvm/include/llvm/IR/Function.h
  llvm/lib/Bitcode/Reader/BitcodeReader.cpp
  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
  llvm/lib/IR/Attributes.cpp
  llvm/lib/Transforms/Utils/CodeExtractor.cpp

Index: llvm/lib/Transforms/Utils/CodeExtractor.cpp
===================================================================
--- llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -960,6 +960,7 @@
       case Attribute::NoCfCheck:
       case Attribute::MustProgress:
       case Attribute::NoProfile:
+      case Attribute::Shuffle:
         break;
       // These attributes cannot be applied to functions.
       case Attribute::Alignment:
Index: llvm/lib/IR/Attributes.cpp
===================================================================
--- llvm/lib/IR/Attributes.cpp
+++ llvm/lib/IR/Attributes.cpp
@@ -1804,7 +1804,8 @@
           .addAttribute(Attribute::StructRet)
           .addAttribute(Attribute::ByRef)
           .addAttribute(Attribute::ElementType)
-          .addAttribute(Attribute::AllocatedPointer);
+          .addAttribute(Attribute::AllocatedPointer)
+          .addAttribute(Attribute::Shuffle);
   }
 
     // Attributes that only apply to pointers or vectors of pointers.
Index: llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
===================================================================
--- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -778,6 +778,8 @@
   case Attribute::EmptyKey:
   case Attribute::TombstoneKey:
     llvm_unreachable("Trying to encode EmptyKey/TombstoneKey");
+  case Attribute::Shuffle:
+    return bitc::ATTR_KIND_SHUFFLE;
   }
 
   llvm_unreachable("Trying to encode unknown attribute");
Index: llvm/lib/Bitcode/Reader/BitcodeReader.cpp
===================================================================
--- llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1632,6 +1632,8 @@
     return Attribute::MustProgress;
   case bitc::ATTR_KIND_HOT:
     return Attribute::Hot;
+  case bitc::ATTR_KIND_SHUFFLE:
+    return Attribute::Shuffle;
   }
 }
 
Index: llvm/include/llvm/IR/Function.h
===================================================================
--- llvm/include/llvm/IR/Function.h
+++ llvm/include/llvm/IR/Function.h
@@ -626,6 +626,12 @@
     return AttributeSets.getUWTableKind();
   }
 
+  /// Determine if the function is __shfl_sync like.
+  bool isShuffle() const {
+    return hasFnAttribute(Attribute::Shuffle);
+  }
+  void setShuffle() { addFnAttr(Attribute::Shuffle); }
+
   /// True if the ABI mandates (or the user requested) that this
   /// function be in a unwind table.
   bool hasUWTable() const {
Index: llvm/include/llvm/IR/Attributes.td
===================================================================
--- llvm/include/llvm/IR/Attributes.td
+++ llvm/include/llvm/IR/Attributes.td
@@ -300,6 +300,9 @@
 /// Function is required to make Forward Progress.
 def MustProgress : EnumAttr<"mustprogress", [FnAttr]>;
 
+/// Function is a __shfl_sync like API.
+def Shuffle : EnumAttr<"shuffle", [FnAttr]>;
+
 /// Target-independent string attributes.
 def LessPreciseFPMAD : StrBoolAttr<"less-precise-fpmad">;
 def NoInfsFPMath : StrBoolAttr<"no-infs-fp-math">;
Index: llvm/include/llvm/Bitcode/LLVMBitCodes.h
===================================================================
--- llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -684,6 +684,7 @@
   ATTR_KIND_NO_SANITIZE_BOUNDS = 79,
   ATTR_KIND_ALLOC_ALIGN = 80,
   ATTR_KIND_ALLOCATED_POINTER = 81,
+  ATTR_KIND_SHUFFLE = 82,
 };
 
 enum ComdatSelectionKindCodes {
Index: clang/test/Misc/pragma-attribute-supported-attributes-list.test
===================================================================
--- clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -161,6 +161,7 @@
 // CHECK-NEXT: ScopedLockable (SubjectMatchRule_record)
 // CHECK-NEXT: Section (SubjectMatchRule_function, SubjectMatchRule_variable_is_global, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property)
 // CHECK-NEXT: SetTypestate (SubjectMatchRule_function_is_member)
+// CHECK-NEXT: Shuffle (SubjectMatchRule_function)
 // CHECK-NEXT: SpeculativeLoadHardening (SubjectMatchRule_function, SubjectMatchRule_objc_method)
 // CHECK-NEXT: StandaloneDebug (SubjectMatchRule_record)
 // CHECK-NEXT: SwiftAsync (SubjectMatchRule_function, SubjectMatchRule_objc_method)
Index: clang/test/CodeGenHIP/shuffle-noundef-attr.hip
===================================================================
--- /dev/null
+++ clang/test/CodeGenHIP/shuffle-noundef-attr.hip
@@ -0,0 +1,48 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm  %s \
+// RUN:   -o - | FileCheck %s
+
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+#define __shuffle __attribute__((shuffle))
+#define HYPRE_WARP_SIZE 64
+
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
+__device__ static inline unsigned int __lane_id() {
+    return  __builtin_amdgcn_mbcnt_hi(
+        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+}
+
+__device__
+inline
+int __shfl(int var, int src_lane, int width = warpSize) {
+    int self = __lane_id();
+    int index = src_lane + (self & ~(width-1));
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+
+template <typename T>
+static __device__
+T __shuffle __shfl_sync(unsigned mask, T val, int src_line, int width=HYPRE_WARP_SIZE)
+{
+   return __shfl(val, src_line, width);
+}
+
+template <typename T>
+static __device__
+T __shfl_sync_1(unsigned mask, T val, int src_line, int width=HYPRE_WARP_SIZE)
+{
+   return __shfl(val, src_line, width);
+}
+
+// CHECK-LABEL: @_Z13shufflekernelv(
+// CHECK: call i32 @_ZL11__shfl_syncIiET_jS0_ii(i32 64, i32 %0, i32 0, i32 64)
+// CHECK: call noundef i32 @_ZL13__shfl_sync_1IiET_jS0_ii(i32 noundef 64, i32 noundef %1, i32 noundef 0, i32 noundef 64)
+__global__ void
+shufflekernel()
+{
+    int t;
+    int res, res1;
+    res = __shfl_sync(HYPRE_WARP_SIZE, t, 0);
+    res1 = __shfl_sync_1(HYPRE_WARP_SIZE, t, 0);
+}
\ No newline at end of file
Index: clang/test/CodeGenHIP/shuffle-attr-verify.hip
===================================================================
--- /dev/null
+++ clang/test/CodeGenHIP/shuffle-attr-verify.hip
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm  %s \
+// RUN:   -o - | FileCheck %s
+
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+#define __shuffle __attribute__((shuffle))
+#define HYPRE_WARP_SIZE 64
+
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
+__device__ static inline unsigned int __lane_id() {
+    return  __builtin_amdgcn_mbcnt_hi(
+        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+}
+
+// CHECK: define linkonce_odr i32 @_Z11__shfl_synciii(i32 %var, i32 %src_lane, i32 %width) #[[attr1:[0-9]+]]
+__device__
+inline
+int __shuffle __shfl_sync(int var, int src_lane, int width = warpSize) {
+    int self = __lane_id();
+    int index = src_lane + (self & ~(width-1));
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+
+__global__ void
+shufflekernel()
+{
+    int t;
+    int res, res1;
+    res = __shfl_sync(HYPRE_WARP_SIZE, t, 0);
+}
+
+// CHECK-DAG: attributes #[[attr1]] = { {{[^}]*}}shuffle{{[^}]*}} }
\ No newline at end of file
Index: clang/lib/Sema/SemaDeclAttr.cpp
===================================================================
--- clang/lib/Sema/SemaDeclAttr.cpp
+++ clang/lib/Sema/SemaDeclAttr.cpp
@@ -8423,6 +8423,9 @@
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_Shuffle:
+    handleSimpleAttribute<ShuffleAttr>(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
Index: clang/lib/Headers/__clang_cuda_intrinsics.h
===================================================================
--- clang/lib/Headers/__clang_cuda_intrinsics.h
+++ clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -45,7 +45,7 @@
     _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
     _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
     __Bits __tmp;                                                              \
-    memcpy(&__tmp, &__val, sizeof(__val));                                \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
     __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
     __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
     long long __ret;                                                           \
@@ -100,27 +100,29 @@
 
 #if CUDA_VERSION >= 9000
 #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+#define __shuffle __attribute__((shuffle))
 // __shfl_sync_* variants available in CUDA-9
 #pragma push_macro("__MAKE_SYNC_SHUFFLES")
 #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                              __Mask, __Type)                                   \
-  inline __device__ int __FnName(unsigned int __mask, int __val,               \
+  inline __device__ __shuffle int __FnName(unsigned int __mask, int __val,     \
                                  __Type __offset, int __width = warpSize) {    \
     return __IntIntrinsic(__mask, __val, __offset,                             \
                           ((warpSize - __width) << 8) | (__Mask));             \
   }                                                                            \
-  inline __device__ float __FnName(unsigned int __mask, float __val,           \
+  inline __device__ __shuffle float __FnName(unsigned int __mask, float __val, \
                                    __Type __offset, int __width = warpSize) {  \
     return __FloatIntrinsic(__mask, __val, __offset,                           \
                             ((warpSize - __width) << 8) | (__Mask));           \
   }                                                                            \
-  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
+  inline __device__ __shuffle unsigned int __FnName(unsigned int __mask,       \
                                           unsigned int __val, __Type __offset, \
                                           int __width = warpSize) {            \
     return static_cast<unsigned int>(                                          \
         ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
   }                                                                            \
-  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
+  inline __device__ __shuffle long long __FnName(unsigned int __mask,          \
+                                      long long __val,                         \
                                        __Type __offset,                        \
                                        int __width = warpSize) {               \
     struct __Bits {                                                            \
@@ -136,13 +138,13 @@
     memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
     return __ret;                                                              \
   }                                                                            \
-  inline __device__ unsigned long long __FnName(                               \
+  inline __device__ __shuffle unsigned long long __FnName(                     \
       unsigned int __mask, unsigned long long __val, __Type __offset,          \
       int __width = warpSize) {                                                \
     return static_cast<unsigned long long>(::__FnName(                         \
         __mask, static_cast<unsigned long long>(__val), __offset, __width));   \
   }                                                                            \
-  inline __device__ long __FnName(unsigned int __mask, long __val,             \
+  inline __device__ __shuffle long __FnName(unsigned int __mask, long __val,   \
                                   __Type __offset, int __width = warpSize) {   \
     _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                    sizeof(long) == sizeof(int));                               \
@@ -154,13 +156,14 @@
           ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
     }                                                                          \
   }                                                                            \
-  inline __device__ unsigned long __FnName(                                    \
+  inline __device__ __shuffle unsigned long __FnName(                          \
       unsigned int __mask, unsigned long __val, __Type __offset,               \
       int __width = warpSize) {                                                \
     return static_cast<unsigned long>(                                         \
         ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
   }                                                                            \
-  inline __device__ double __FnName(unsigned int __mask, double __val,         \
+  inline __device__ __shuffle double __FnName(unsigned int __mask,             \
+                                    double __val,                              \
                                     __Type __offset, int __width = warpSize) { \
     long long __tmp;                                                           \
     _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
Index: clang/lib/CodeGen/CGCall.cpp
===================================================================
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -2035,6 +2035,19 @@
   return false;
 }
 
+static bool DetermineNoUndefForShuffle(const Decl *TargetDecl) {
+  if (!TargetDecl)
+    return true;
+
+  // Function has shuffle attribute. 
+  // Skip adding noundef in this case.
+  if (TargetDecl->hasAttr<ShuffleAttr>()) {
+    return false;
+  }
+
+  return true;
+}
+
 /// Construct the IR attribute list of a function or call.
 ///
 /// When adding an attribute, please consider where it should be handled:
@@ -2101,6 +2114,8 @@
       FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate);
     if (TargetDecl->hasAttr<ConvergentAttr>())
       FuncAttrs.addAttribute(llvm::Attribute::Convergent);
+    if (TargetDecl->hasAttr<ShuffleAttr>())
+      FuncAttrs.addAttribute(llvm::Attribute::Shuffle);
 
     if (const FunctionDecl *Fn = dyn_cast<FunctionDecl>(TargetDecl)) {
       AddAttributesFromFunctionProtoType(
@@ -2298,8 +2313,10 @@
   // Determine if the return type could be partially undef
   if (CodeGenOpts.EnableNoundefAttrs && HasStrictReturn) {
     if (!RetTy->isVoidType() && RetAI.getKind() != ABIArgInfo::Indirect &&
-        DetermineNoUndef(RetTy, getTypes(), DL, RetAI))
-      RetAttrs.addAttribute(llvm::Attribute::NoUndef);
+        DetermineNoUndef(RetTy, getTypes(), DL, RetAI) &&
+        DetermineNoUndefForShuffle(TargetDecl)) {
+        RetAttrs.addAttribute(llvm::Attribute::NoUndef);
+    }
   }
 
   switch (RetAI.getKind()) {
@@ -2431,8 +2448,9 @@
 
     // Decide whether the argument we're handling could be partially undef
     if (CodeGenOpts.EnableNoundefAttrs &&
-        DetermineNoUndef(ParamType, getTypes(), DL, AI)) {
-      Attrs.addAttribute(llvm::Attribute::NoUndef);
+        DetermineNoUndef(ParamType, getTypes(), DL, AI) &&
+        DetermineNoUndefForShuffle(TargetDecl)) {
+        Attrs.addAttribute(llvm::Attribute::NoUndef);
     }
 
     // 'restrict' -> 'noalias' is done in EmitFunctionProlog when we
Index: clang/include/clang/Basic/AttrDocs.td
===================================================================
--- clang/include/clang/Basic/AttrDocs.td
+++ clang/include/clang/Basic/AttrDocs.td
@@ -1316,6 +1316,30 @@
   }];
 }
 
+def ShuffleDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``shuffle`` attribute can be placed on a function declaration. It indicates
+that the call instructions of a function with this attribute can take undef
+arguments and is still valid.
+
+In languages HIP or CUDA, there are APIs like 
+T __shfl_sync(unsigned mask,T var, int srcLane, int width=warpSize);
+etc which permit exchanging of a variable between threads within a warp without 
+use of shared memory. These APIs allow variable var to be uninitialised in the program.
+Noundef analysis on such APIs can lead to ambiguous kernel execution. 
+So shuffle attribute on a function is used to skip adding noundef attribute to such APIs.
+
+Sample usage:
+.. code-block:: c
+
+  void shufflefunc(void) __attribute__((shuffle));
+  // Setting it as a C++11 attribute is also valid in a C++ program.
+  // void shufflefunc(void) [[clang::shuffle]];
+
+  }];
+}
+
 def NoSplitStackDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
Index: clang/include/clang/Basic/Attr.td
===================================================================
--- clang/include/clang/Basic/Attr.td
+++ clang/include/clang/Basic/Attr.td
@@ -1774,6 +1774,13 @@
   let SimpleHandler = 1;
 }
 
+def Shuffle : InheritableAttr {
+  let Spellings = [Clang<"shuffle">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [ShuffleDocs];
+  let SimpleHandler = 1;
+}
+
 def NoInline : DeclOrStmtAttr {
   let Spellings = [GCC<"noinline">, CXX11<"clang", "noinline">,
                    C2x<"clang", "noinline">, Declspec<"noinline">];
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D125378:... krishna chaitanya sankisa via Phabricator via cfe-commits

Reply via email to