jlebar created this revision.
jlebar added reviewers: tra, rnk.
jlebar added a subscriber: cfe-commits.
Setting this flag causes all device functions to be annotated with the
"nvptx-f32ftz"="true" attribute.
In addition, we annotate the module with the "nvvm-reflect-ftz" flag, set to
0 or 1 depending on whether --cuda-flush-denormals-to-zero is passed.
This flag is read by the NVVMReflect pass.
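For illustration, here is a rough sketch of the intended effect (the device
function below and the exact metadata numbering are hypothetical; the attribute
and module-flag names match the test added in this patch):

  // Sketch: device code compiled with --cuda-flush-denormals-to-zero
  // (the device-side cc1 invocation then receives -fcuda-flush-denormals-to-zero).
  __device__ float halve(float x) { return x * 0.5f; }

  // Expected shape of the emitted IR annotations (illustrative, not verbatim output):
  //   define float @_Z5halvef(float %x) #0 { ... }
  //   attributes #0 = { ... "nvptx-f32ftz"="true" ... }
  //   !llvm.module.flags = !{!0}
  //   !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}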
http://reviews.llvm.org/D18671
Files:
include/clang/Basic/LangOptions.def
include/clang/Driver/CC1Options.td
include/clang/Driver/Options.td
lib/CodeGen/CGCall.cpp
lib/CodeGen/CodeGenModule.cpp
lib/Driver/ToolChains.cpp
lib/Frontend/CompilerInvocation.cpp
test/CodeGenCUDA/flush-denormals.cu
Index: test/CodeGenCUDA/flush-denormals.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -fcuda-is-device \
+// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix NOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \
+// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix FTZ
+
+#include "Inputs/cuda.h"
+
+// Checks that device functions are emitted with the "nvptx-f32ftz" attribute
+// set to "true" when we compile CUDA device code with
+// -fcuda-flush-denormals-to-zero. Further, checks that we reflect the presence
+// or absence of -fcuda-flush-denormals-to-zero in a module flag.
+
+// CHECK: define void @foo() #0
+extern "C" __device__ void foo() {}
+
+// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
+// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+
+// FTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+
+// NOFTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// NOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -1563,6 +1563,10 @@
if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
Opts.CUDAHostDeviceConstexpr = 0;
+ if (Opts.CUDAIsDevice &&
+ (Args.hasArg(OPT_fcuda_flush_denormals_to_zero) || Opts.UnsafeFPMath))
+ Opts.CUDADeviceFlushDenormalsToZero = 1;
+
if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
StringRef value = arg->getValue();
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -4208,6 +4208,9 @@
Linux::addClangTargetOptions(DriverArgs, CC1Args);
CC1Args.push_back("-fcuda-is-device");
+ if (DriverArgs.hasArg(options::OPT_cuda_flush_denormals_to_zero))
+ CC1Args.push_back("-fcuda-flush-denormals-to-zero");
+
if (DriverArgs.hasArg(options::OPT_nocudalib))
return;
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -472,6 +472,14 @@
getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1);
}
+ if (LangOpts.CUDAIsDevice && getTarget().getTriple().isNVPTX()) {
+ // Indicate whether __nvvm_reflect should be configured to flush denormal
+ // floating point values to 0. (This corresponds to its "__CUDA_FTZ"
+ // property.)
+ getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
+ LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
+ }
+
if (uint32_t PLevel = Context.getLangOpts().PICLevel) {
llvm::PICLevel::Level PL = llvm::PICLevel::Default;
switch (PLevel) {
Index: lib/CodeGen/CGCall.cpp
===================================================================
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -1763,6 +1763,10 @@
// __syncthreads(), and so can't have certain optimizations applied around
// them). LLVM will remove this attribute where it safely can.
FuncAttrs.addAttribute(llvm::Attribute::Convergent);
+
+ // Respect -fcuda-flush-denormals-to-zero.
+ if (getLangOpts().CUDADeviceFlushDenormalsToZero)
+ FuncAttrs.addAttribute("nvptx-f32ftz", "true");
}
ClangToLLVMArgMapping IRFunctionArgs(getContext(), FI);
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -382,6 +382,8 @@
HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
HelpText<"CUDA installation path">;
+def cuda_flush_denormals_to_zero : Flag<["--"], "cuda-flush-denormals-to-zero">,
+ HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
def dA : Flag<["-"], "dA">, Group<d_Group>;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -693,6 +693,9 @@
HelpText<"Allow variadic functions in CUDA device code.">;
def fno_cuda_host_device_constexpr : Flag<["-"], "fno-cuda-host-device-constexpr">,
HelpText<"Don't treat unattributed constexpr functions as __host__ __device__.">;
+def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
+ HelpText<"Flush denormal floating point values to zero in CUDA device code. "
+ "-menable-unsafe-fp-math implies this option.">;
//===----------------------------------------------------------------------===//
// OpenMP Options
Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -173,6 +173,7 @@
LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
+LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")