tra updated this revision to Diff 374028.
tra added a comment.

Minor cleanups


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D110089/new/

https://reviews.llvm.org/D110089

Files:
  clang/include/clang/Basic/Builtins.def
  clang/include/clang/Sema/Sema.h
  clang/lib/AST/ExprConstant.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_runtime_wrapper.h
  clang/lib/Headers/__clang_cuda_texture_intrinsics.h
  clang/lib/Sema/SemaChecking.cpp

Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -1447,6 +1447,9 @@
   case llvm::Triple::riscv32:
   case llvm::Triple::riscv64:
     return CheckRISCVBuiltinFunctionCall(TI, BuiltinID, TheCall);
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    return CheckNVPTXBuiltinFunctionCall(BuiltinID, TheCall);
   }
 }
 
@@ -3559,6 +3562,27 @@
   return false;
 }
 
+static bool CheckNVVMTextureOp(Sema &S, unsigned BuiltinID, CallExpr *TheCall) {
+  // The first argument of __nvvm_texture_op must be a string literal.
+  Expr *ArgExpr = TheCall->getArg(0);
+  Expr::EvalResult ArgResult;
+  if (!ArgExpr->EvaluateAsConstantExpr(ArgResult, S.Context))
+    return S.Diag(ArgExpr->getExprLoc(), diag::err_expr_not_string_literal)
+           << ArgExpr->getType();
+  return false;
+}
+
+bool Sema::CheckNVPTXBuiltinFunctionCall(unsigned BuiltinID,
+                                         CallExpr *TheCall) {
+  switch (BuiltinID) {
+  case Builtin::BI__nvvm_texture_op:
+    return CheckNVVMTextureOp(*this, BuiltinID, TheCall);
+  default:
+    return false;
+  }
+}
+
 bool Sema::CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum) {
   llvm::APSInt Result;
 
Index: clang/lib/Headers/__clang_cuda_texture_intrinsics.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/__clang_cuda_texture_intrinsics.h
@@ -0,0 +1,609 @@
+/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
+#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
+#ifndef __CUDA__
+// NOLINTNEXTLINE(clang-diagnostic-error)
+#error "This file is for CUDA __compilation only."
+#endif
+
+#pragma push_macro("__Args")
+#pragma push_macro("__ID")
+#pragma push_macro("__ID")
+#pragma push_macro("__IDV")
+#pragma push_macro("__IMPL_2DGATHER")
+#pragma push_macro("__IMPL_ALIAS")
+#pragma push_macro("__IMPL_ALIASI")
+#pragma push_macro("__IMPL_F1")
+#pragma push_macro("__IMPL_F3")
+#pragma push_macro("__IMPL_F3N")
+#pragma push_macro("__IMPL_F3S")
+#pragma push_macro("__IMPL_S")
+#pragma push_macro("__IMPL_S3")
+#pragma push_macro("__IMPL_S3I")
+#pragma push_macro("__IMPL_S3N")
+#pragma push_macro("__IMPL_S3NI")
+#pragma push_macro("__IMPL_S3S")
+#pragma push_macro("__IMPL_S3SI")
+#pragma push_macro("__IMPL_SI")
+#pragma push_macro("__L")
+#pragma push_macro("__STRIP_PARENS")
+#pragma push_macro("__V4")
+#pragma push_macro("__V4P")
+
+#include <type_traits>
+
+namespace {
+
+template <int N> struct __Tag;
+#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
+  __tex_fetch<__Tag<__nvvm_texture_op(__op)>>(__ptr, __VA_ARGS__)
+
+#define __ID(__op) __Tag<__nvvm_texture_op(__op)>
+// Tags for the variants of a particular operation. E.g. tex2Dgather can
+// translate into 4 different instructions.
+#define __IDV(__op, __variant)                                                 \
+  __Tag<10000 + __nvvm_texture_op(__op) * 100 + __variant>
+
+// Helper classes for figuring out the fetch type.
+template <class> struct __FT;
+// Fundamental types.
+template <> struct __FT<float> {
+  using __bt = float;
+  using __ft = float4;
+};
+template <> struct __FT<char> {
+  using __bt = char;
+  using __ft = int4;
+};
+template <> struct __FT<signed char> {
+  using __bt = signed char;
+  using __ft = int4;
+};
+template <> struct __FT<unsigned char> {
+  using __bt = unsigned char;
+  using __ft = uint4;
+};
+template <> struct __FT<short> {
+  using __bt = short;
+  using __ft = int4;
+};
+template <> struct __FT<ushort> {
+  using __bt = ushort;
+  using __ft = uint4;
+};
+template <> struct __FT<int> {
+  using __bt = int;
+  using __ft = int4;
+};
+template <> struct __FT<uint> {
+  using __bt = uint;
+  using __ft = uint4;
+};
+
+// Derived base/fetch types for N-element vectors.
+template <class __T> struct __FT {
+  using __bt = decltype(__T::x);
+  using __ft = typename __FT<__bt>::__ft;
+};
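+// e.g. __FT<float2>::__bt is float and __FT<float2>::__ft is float4.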
+
+// Classes that implement specific texture ops.
+template <class __op> struct __tex_fetch_v4;
+template <> struct __tex_fetch_v4<__Tag<-1>>; // Unknown op
+
+// Helper macros to strip parens from a macro argument.
+#define __Args(...) __VA_ARGS__
+#define __STRIP_PARENS(__X) __X
+#define __L(__X) __STRIP_PARENS(__Args __X)
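+// e.g. __L((int __x, float __y)) expands to: int __x, float __y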
+
+// Results are stored in a temp var __r. For sparse fetches, the boolean
+// isResident flag is written through the __ir pointer.
+// Asm output arguments for the return value, a 4-element vector
+#define __V4(__t)                                                              \
+  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
+// .. possibly combined with a predicate.
+#define __V4P(__t) (__L(__V4(__t)), "=h"(*__ir))
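+// e.g. __V4("r") expands to
+//   ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w)).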
+
+#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
+  template <>                                                                  \
+  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
+    __rt __r;                                                                  \
+    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
+    return __r;                                                                \
+  }
+
+#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
+  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
+            __V4("r"), __asm_args)                                             \
+  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
+            __V4("r"), __asm_args)                                             \
+  __IMPL_F1(float4, float4, __args,                                            \
+            __asm_op ".f32." __ctype "\t" __asm_op_args, __V4("f"),            \
+            __asm_args)
+
+#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
+  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
+            __V4P("r"), __asm_args)                                            \
+  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
+            __V4P("r"), __asm_args)                                            \
+  __IMPL_F1(float4, float4, __args,                                            \
+            __asm_op ".f32." __ctype "\t" __asm_op_args, __V4P("f"),           \
+            __asm_args)
+
+#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
+  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
+            __V4("r"), __asm_args)                                             \
+  __IMPL_F1(float4, uint4, __args,                                             \
+            __asm_op ".u32." __ctype "\t" __asm_op_args, __V4("r"),            \
+            __asm_args)
+
+#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
+  template <> struct __tex_fetch_v4<__op> {                                    \
+    template <class T>                                                         \
+    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
+    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
+  }
+
+#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
+  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
+
+// Same, but for sparse ops.
+#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
+                    __asm_args)                                                \
+  template <> struct __tex_fetch_v4<__op> {                                    \
+    template <class T>                                                         \
+    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
+    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
+  }
+
+#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
+  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
+
+#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
+                    __asm_args)                                                \
+  template <> struct __tex_fetch_v4<__op> {                                    \
+    template <class T>                                                         \
+    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
+    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
+  }
+
+#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
+  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
+
+#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
+                  __asm_args)                                                  \
+  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
+  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
+
+#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
+                 __asm_args)                                                   \
+  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
+            __asm_args)
+
+#define __IMPL_ALIASI(__op, __opn)                                             \
+  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
+#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
+
+__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
+__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
+         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
+
+__IMPL_ALIAS("__itex1D", "__tex1D_v2");
+__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
+
+__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
+         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
+         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
+__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
+
+__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
+         (float __x, int __layer), "tex.a1d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
+__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
+
+__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
+         (float __x, int __layer, float __dPdx, float __dPdy),
+         "tex.grad.a1d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
+         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
+__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
+
+__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
+         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
+         ("r"(__layer), "f"(__x), "f"(__level)));
+__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
+
+__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
+         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
+         ("f"(__x), "f"(__level)));
+__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
+
+// 2D
+__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
+         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
+__IMPL_ALIAS("__itex2D", "__tex2D_v2");
+__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.2d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
+           " selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y)));
+
+__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
+         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
+         "tex.grad.2d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
+         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
+          "f"(__dPdy->y)));
+__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
+__IMPL_S3S("__itex2DGrad_sparse",
+           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
+            unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.grad.2d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
+            "f"(__dPdy->y)));
+
+__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
+         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
+         ("r"(__layer), "f"(__x), "f"(__y)));
+__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
+__IMPL_S3S("__itex2DLayered_sparse",
+           (float __x, float __y, int __layer, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.a2d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("r"(__layer), "f"(__x), "f"(__y)));
+
+__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
+         (float __x, float __y, int __layer, const float2 *__dPdx,
+          const float2 *__dPdy),
+         "tex.grad.a2d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
+          "f"(__dPdy->x), "f"(__dPdy->y)));
+__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
+__IMPL_S3S(
+    "__itex2DLayeredGrad_sparse",
+    (float __x, float __y, int __layer, const float2 *__dPdx,
+     const float2 *__dPdy, unsigned char *__ir),
+    "{.reg .pred %%p0;\n\t"
+    "tex.grad.a2d.v4",
+    "f32",
+    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
+    "selp.u16 %4, 1, 0, %%p0; }",
+    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
+     "f"(__dPdy->x), "f"(__dPdy->y)));
+
+__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
+         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
+         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
+__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
+__IMPL_S3S("__itex2DLayeredLod_sparse",
+           (float __x, float __y, int __layer, float __level,
+            unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.level.a2d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
+
+__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
+         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
+         ("f"(__x), "f"(__y), "f"(__level)));
+__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
+__IMPL_S3S("__itex2DLod_sparse",
+           (float __x, float __y, float __level, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.level.2d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__level)));
+
+// 2D gather uses one of four different instructions selected by __comp. We
+// implement each variant separately, plus one umbrella function that picks
+// the right variant.
+
+#define __IMPL_2DGATHER(variant, instr)                                        \
+  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
+            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
+            (float __x, float __y, int __comp), instr, "f32",                  \
+            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
+  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
+                __IDV("__tex2Dgather_v2", variant));                           \
+  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
+              (float __x, float __y, unsigned char *__ir, int __comp),         \
+              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
+              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
+              "selp.u16 %4, 1, 0, %%p0; }",                                    \
+              ("f"(__x), "f"(__y)));
+__IMPL_2DGATHER(0, "tld4.r.2d.v4");
+__IMPL_2DGATHER(1, "tld4.g.2d.v4");
+__IMPL_2DGATHER(2, "tld4.b.2d.v4");
+__IMPL_2DGATHER(3, "tld4.a.2d.v4");
+
+template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
+  template <class __T>
+  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
+                              int __comp) {
+    switch (__comp) {
+    case 0:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 1:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 2:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 3:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    }
+  }
+};
+__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
+template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
+  template <class __T>
+  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
+                                 float __y, int __comp) {
+    switch (__comp) {
+    case 0:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 1:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 2:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    case 3:
+      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
+          __obj, __x, __y, __comp);
+    }
+  }
+};
+template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
+  template <class __T>
+  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
+                              unsigned char *__ir, int __comp) {
+    switch (__comp) {
+    case 0:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    case 1:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    case 2:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    case 3:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    }
+  }
+};
+
+// 3D
+__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
+         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
+         ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itex3D", "__tex3D_v2");
+__IMPL_S3S("__itex3D_sparse",
+           (float __x, float __y, float __z, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z)));
+
+__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
+         (float __x, float __y, float __z, const float4 *__dPdx,
+          const float4 *__dPdy),
+         "tex.grad.3d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
+         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
+__IMPL_S3S("__itex3DGrad_sparse",
+           (float __x, float __y, float __z, const float4 *__dPdx,
+            const float4 *__dPdy, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.grad.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
+           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+
+__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
+         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
+         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
+__IMPL_S3S("__itex3DLod_sparse",
+           (float __x, float __y, float __z, float __level,
+            unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.level.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+
+// Cubemap
+__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
+         (float __x, float __y, float __z), "tex.cube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
+         ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
+__IMPL_S3S("__itexCubemap_sparse",
+           (float __x, float __y, float __z, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.cube.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
+         (float __x, float __y, float __z, const float4 *__dPdx,
+          const float4 *__dPdy),
+         "tex.grad.cube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
+         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
+__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
+         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
+__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
+         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
+          const float4 *__dPdy),
+         "tex.grad.acube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
+         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
+          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
+          "f"(__dPdy->z)));
+__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
+__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
+         (float __x, float __y, float __z, int __layer, float __level),
+         "tex.level.acube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
+__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
+         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
+         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
+
+// Helper class for extracting a slice of data from V4 fetch results.
+template <class __DestT, class __SrcT> struct __convert {
+  template <bool __IsConvertible = std::is_convertible<__DestT, __SrcT>::value,
+            int __N = sizeof(__DestT) / sizeof(typename __FT<__DestT>::__bt)>
+  __device__ static __DestT __run(__SrcT __v) {
+    return __v;
+  }
+  template <> __device__ static __DestT __run<false, 1>(__SrcT __v) {
+    return {__v.x};
+  }
+  template <> __device__ static __DestT __run<false, 2>(__SrcT __v) {
+    return {__v.x, __v.y};
+  }
+  template <> __device__ static __DestT __run<false, 3>(__SrcT __v) {
+    return {__v.x, __v.y, __v.z};
+  }
+  template <> __device__ static __DestT __run<false, 4>(__SrcT __v) {
+    return {__v.x, __v.y, __v.z, __v.w};
+  }
+};
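+// e.g. __convert<float2, float4>::__run(__v) returns {__v.x, __v.y}.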
+
+// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
+//   Data type and return type are based on ret.
+template <class __op, class __T, class... __Args>
+__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
+                                   __Args... __args) {
+  using __FT = typename __FT<__T>::__ft;
+  *__ptr = __convert<__T, __FT>::__run(
+      __tex_fetch_v4<__op>::template __run<__FT>(__handle, __args...));
+}
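+// e.g. the following fetches a float4 texel at (__x, __y) from __obj:
+//   float4 __r;
+//   __nv_tex_surf_handler("__tex2D_v2", &__r, __obj, __x, __y);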
+
+// texture<> objects get magically converted into a texture reference.  However,
+// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
+// cheat a bit and use inline assembly to do it. It costs us an extra register
+// and a move, but that is easy for ptxas to optimize away.
+template <class __T>
+__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
+  cudaTextureObject_t __obj;
+  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
+  return __obj;
+}
+
+// __nv_tex_surf_handler ("__tex...", &ret, textureReference, x);
+//   Data type and return type is based on ret.
+template <class __op, class __T, class __HandleT, class... __Args>
+__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
+                                   __Args... __args) {
+  using __FT = typename __FT<__T>::__ft;
+  *__ptr =
+      __convert<__T, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>(
+          __tex_handle_to_obj(__handle), __args...));
+}
+
+// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, x);
+// cudaReadModeNormalizedFloat fetches always return float4.
+template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
+__device__ static void
+__tex_fetch(__DataT *, __RetT *__ptr,
+            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
+            __Args... __args) {
+  using __FT = typename __FT<__DataT>::__ft;
+  *__ptr = __convert<__RetT, float4>::__run(
+      __tex_fetch_v4<__op>::template __run<__FT>(__tex_handle_to_obj(__handle),
+                                                 __args...));
+}
+
+// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, x);
+// For cudaReadModeElementType fetch return type is based on type_dummy.
+template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
+__device__ static void
+__tex_fetch(__DataT *, __RetT *__ptr,
+            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
+            __Args... __args) {
+  using __FT = typename __FT<__DataT>::__ft;
+  *__ptr =
+      __convert<__RetT, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>(
+          __tex_handle_to_obj(__handle), __args...));
+}
+} // namespace
+#pragma pop_macro("__Args")
+#pragma pop_macro("__ID")
+#pragma pop_macro("__ID")
+#pragma pop_macro("__IDV")
+#pragma pop_macro("__IMPL_2DGATHER")
+#pragma pop_macro("__IMPL_ALIAS")
+#pragma pop_macro("__IMPL_ALIASI")
+#pragma pop_macro("__IMPL_F1")
+#pragma pop_macro("__IMPL_F3")
+#pragma pop_macro("__IMPL_F3N")
+#pragma pop_macro("__IMPL_F3S")
+#pragma pop_macro("__IMPL_S")
+#pragma pop_macro("__IMPL_S3")
+#pragma pop_macro("__IMPL_S3I")
+#pragma pop_macro("__IMPL_S3N")
+#pragma pop_macro("__IMPL_S3NI")
+#pragma pop_macro("__IMPL_S3S")
+#pragma pop_macro("__IMPL_S3SI")
+#pragma pop_macro("__IMPL_SI")
+#pragma pop_macro("__L")
+#pragma pop_macro("__STRIP_PARENS")
+#pragma pop_macro("__V4")
+#pragma pop_macro("__V4P")
+#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__
Index: clang/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -64,9 +64,9 @@
 #endif
 
 // Make largest subset of device functions available during host
-// compilation -- SM_35 for the time being.
+// compilation.
 #ifndef __CUDA_ARCH__
-#define __CUDA_ARCH__ 350
+#define __CUDA_ARCH__ 9999
 #endif
 
 #include "__clang_cuda_builtin_vars.h"
@@ -330,7 +330,13 @@
 
 #pragma pop_macro("__host__")
 
+// clang-format off
+// __clang_cuda_texture_intrinsics.h must be included first, as it provides
+// the implementation of __nv_tex_surf_handler that CUDA's headers depend on.
+#include <__clang_cuda_texture_intrinsics.h>
+#include "texture_fetch_functions.h"
 #include "texture_indirect_functions.h"
+// clang-format on
 
 // Restore state of __CUDA_ARCH__ and __THROW we had on entry.
 #pragma pop_macro("__CUDA_ARCH__")
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -47,6 +47,7 @@
   __clang_cuda_complex_builtins.h
   __clang_cuda_device_functions.h
   __clang_cuda_intrinsics.h
  __clang_cuda_libdevice_declares.h
  __clang_cuda_math_forward_declares.h
  __clang_cuda_runtime_wrapper.h
+  __clang_cuda_texture_intrinsics.h
Index: clang/lib/AST/ExprConstant.cpp
===================================================================
--- clang/lib/AST/ExprConstant.cpp
+++ clang/lib/AST/ExprConstant.cpp
@@ -11094,6 +11094,126 @@
   return EvaluateBuiltinClassifyType(E->getArg(0)->getType(), LangOpts);
 }
 
+static int EvaluateTextureOp(const CallExpr *E) {
+  // Sorted list of the operations supported by '__nv_tex_surf_handler'. The
+  // list must remain sorted, as it is searched with llvm::lower_bound, and
+  // the index of an operation is the value __nvvm_texture_op() evaluates to.
+  static constexpr StringRef TextureOps[] = {"__isurf1DLayeredread",
+                                             "__isurf1DLayeredwrite_v2",
+                                             "__isurf1Dread",
+                                             "__isurf1Dwrite_v2",
+                                             "__isurf2DLayeredread",
+                                             "__isurf2DLayeredwrite_v2",
+                                             "__isurf2Dread",
+                                             "__isurf2Dwrite_v2",
+                                             "__isurf3Dread",
+                                             "__isurf3Dwrite_v2",
+                                             "__isurfCubemapLayeredread",
+                                             "__isurfCubemapLayeredwrite_v2",
+                                             "__isurfCubemapread",
+                                             "__isurfCubemapwrite_v2",
+                                             "__itex1D",
+                                             "__itex1DGrad",
+                                             "__itex1DLayered",
+                                             "__itex1DLayeredGrad",
+                                             "__itex1DLayeredLod",
+                                             "__itex1DLod",
+                                             "__itex1Dfetch",
+                                             "__itex2D",
+                                             "__itex2DGrad_sparse",
+                                             "__itex2DGrad_v2",
+                                             "__itex2DLayered",
+                                             "__itex2DLayeredGrad_sparse",
+                                             "__itex2DLayeredGrad_v2",
+                                             "__itex2DLayeredLod",
+                                             "__itex2DLayeredLod_sparse",
+                                             "__itex2DLayered_sparse",
+                                             "__itex2DLod",
+                                             "__itex2DLod_sparse",
+                                             "__itex2D_sparse",
+                                             "__itex2Dgather",
+                                             "__itex2Dgather_sparse",
+                                             "__itex3D",
+                                             "__itex3DGrad_sparse",
+                                             "__itex3DGrad_v2",
+                                             "__itex3DLod",
+                                             "__itex3DLod_sparse",
+                                             "__itex3D_sparse",
+                                             "__itexCubemap",
+                                             "__itexCubemapGrad_v2",
+                                             "__itexCubemapLayered",
+                                             "__itexCubemapLayeredGrad_v2",
+                                             "__itexCubemapLayeredLod",
+                                             "__itexCubemapLod",
+                                             "__surf1DLayeredread_v2",
+                                             "__surf1DLayeredwrite_v2",
+                                             "__surf1Dread_v2",
+                                             "__surf1Dwrite_v2",
+                                             "__surf2DLayeredread_v2",
+                                             "__surf2DLayeredwrite_v2",
+                                             "__surf2Dread_v2",
+                                             "__surf2Dwrite_v2",
+                                             "__surf3Dread_v2",
+                                             "__surf3Dwrite_v2",
+                                             "__surfCubemapLayeredread_v2",
+                                             "__surfCubemapLayeredwrite_v2",
+                                             "__surfCubemapread_v2",
+                                             "__surfCubemapwrite_v2",
+                                             "__tex1DGrad_rmnf_v2",
+                                             "__tex1DGrad_v2",
+                                             "__tex1DLayeredGrad_rmnf_v2",
+                                             "__tex1DLayeredGrad_v2",
+                                             "__tex1DLayeredLod_rmnf_v2",
+                                             "__tex1DLayeredLod_v2",
+                                             "__tex1DLayered_rmnf_v2",
+                                             "__tex1DLayered_v2",
+                                             "__tex1DLod_rmnf_v2",
+                                             "__tex1DLod_v2",
+                                             "__tex1D_rmnf_v2",
+                                             "__tex1D_v2",
+                                             "__tex1Dfetch_rmnf_v2",
+                                             "__tex1Dfetch_v2",
+                                             "__tex2DGrad_rmnf_v2",
+                                             "__tex2DGrad_v2",
+                                             "__tex2DLayeredGrad_rmnf_v2",
+                                             "__tex2DLayeredGrad_v2",
+                                             "__tex2DLayeredLod_rmnf_v2",
+                                             "__tex2DLayeredLod_v2",
+                                             "__tex2DLayered_rmnf_v2",
+                                             "__tex2DLayered_v2",
+                                             "__tex2DLod_rmnf_v2",
+                                             "__tex2DLod_v2",
+                                             "__tex2D_rmnf_v2",
+                                             "__tex2D_v2",
+                                             "__tex2Dgather_rmnf_v2",
+                                             "__tex2Dgather_v2",
+                                             "__tex3DGrad_rmnf_v2",
+                                             "__tex3DGrad_v2",
+                                             "__tex3DLod_rmnf_v2",
+                                             "__tex3DLod_v2",
+                                             "__tex3D_rmnf_v2",
+                                             "__tex3D_v2",
+                                             "__texCubemapGrad_rmnf_v2",
+                                             "__texCubemapGrad_v2",
+                                             "__texCubemapLayeredGrad_rmnf_v2",
+                                             "__texCubemapLayeredGrad_v2",
+                                             "__texCubemapLayeredLod_rmnf_v2",
+                                             "__texCubemapLayeredLod_v2",
+                                             "__texCubemapLayered_rmnf_v2",
+                                             "__texCubemapLayered_v2",
+                                             "__texCubemapLod_rmnf_v2",
+                                             "__texCubemapLod_v2",
+                                             "__texCubemap_rmnf_v2",
+                                             "__texCubemap_v2"};
+  const StringLiteral *S =
+      dyn_cast<StringLiteral>(E->getArg(0)->IgnoreParenCasts());
+  if (!S)
+    return -1;
+  auto I = llvm::lower_bound(TextureOps, S->getString());
+  if (I == std::end(TextureOps) || *I != S->getString())
+    return -1;
+  return I - std::begin(TextureOps);
+}
+
 /// EvaluateBuiltinConstantPForLValue - Determine the result of
 /// __builtin_constant_p when applied to the given pointer.
 ///
@@ -12123,6 +12243,8 @@
       return false;
     return Success(DidOverflow, E);
   }
+  case clang::Builtin::BI__nvvm_texture_op:
+    return Success((int)EvaluateTextureOp(E), E);
   }
 }
 
Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -12625,6 +12625,7 @@
   bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
   bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckNVPTXBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum);
   bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                      CallExpr *TheCall);
Index: clang/include/clang/Basic/Builtins.def
===================================================================
--- clang/include/clang/Basic/Builtins.def
+++ clang/include/clang/Basic/Builtins.def
@@ -1648,6 +1648,8 @@
 
 // CUDA/HIP
 LANGBUILTIN(__builtin_get_device_side_mangled_name, "cC*.", "ncT", CUDA_LANG)
+// Builtin to convert a texture operation name into a numeric value.
+LANGBUILTIN(__nvvm_texture_op, "icC*", "nc", CUDA_LANG)
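+// e.g. __nvvm_texture_op("__tex2D_v2") evaluates to the index of "__tex2D_v2"
+// in the sorted list of operations in ExprConstant.cpp, or -1 if the name is
+// not a known operation.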
 
 // Builtins for XRay
 BUILTIN(__xray_customevent, "vcC*z", "")