arsenm updated this revision to Diff 238670.
arsenm added a comment.
Forgot clang parts
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D69878/new/
https://reviews.llvm.org/D69878
Files:
clang/include/clang/Basic/CodeGenOptions.def
clang/include/clang/Basic/CodeGenOptions.h
clang/include/clang/Driver/CC1Options.td
clang/include/clang/Driver/Options.td
clang/include/clang/Driver/ToolChain.h
clang/lib/Basic/Targets/AMDGPU.cpp
clang/lib/CodeGen/CGCall.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/AMDGPU.cpp
clang/lib/Driver/ToolChains/AMDGPU.h
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Driver/ToolChains/Cuda.cpp
clang/lib/Driver/ToolChains/Cuda.h
clang/lib/Driver/ToolChains/HIP.cpp
clang/lib/Frontend/CompilerInvocation.cpp
clang/test/CodeGenCUDA/flush-denormals.cu
clang/test/CodeGenCUDA/propagate-metadata.cu
clang/test/CodeGenOpenCL/amdgpu-features.cl
clang/test/CodeGenOpenCL/denorms-are-zero.cl
clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl
clang/test/Driver/cl-denorms-are-zero.cl
clang/test/Driver/cuda-flush-denormals-to-zero.cu
clang/test/Driver/denormal-fp-math.c
clang/test/Driver/opencl.cl
llvm/docs/LangRef.rst
llvm/lib/CodeGen/MachineFunction.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
llvm/test/CodeGen/NVPTX/fast-math.ll
llvm/test/CodeGen/NVPTX/math-intrins.ll
llvm/test/CodeGen/NVPTX/sqrt-approx.ll
llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
Index: llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
===================================================================
--- llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
+++ llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -5,11 +5,11 @@
; hackery:
; RUN: cat %s > %t.ftz
-; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "true" }' >> %t.ftz
+; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz
; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ
; RUN: cat %s > %t.noftz
-; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "false" }' >> %t.noftz
+; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz
; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ
; We handle nvvm intrinsics with ftz variants as follows:
Index: llvm/test/CodeGen/NVPTX/sqrt-approx.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -146,5 +146,5 @@
}
attributes #0 = { "unsafe-fp-math" = "true" }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" }
Index: llvm/test/CodeGen/NVPTX/math-intrins.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -289,4 +289,4 @@
}
attributes #0 = { nounwind readnone }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/fast-math.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/fast-math.ll
+++ llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -162,4 +162,4 @@
}
attributes #0 = { "unsafe-fp-math" = "true" }
-attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
@@ -1709,9 +1710,10 @@
// intrinsic, we don't have to look up any module metadata, as
// FtzRequirementTy will be FTZ_Any.)
if (Action.FtzRequirement != FTZ_Any) {
- bool FtzEnabled =
- II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
- "true";
+ StringRef Attr = II->getFunction()
+ ->getFnAttribute("denormal-fp-math-f32")
+ .getValueAsString();
+ bool FtzEnabled = parseDenormalFPAttribute(Attr) != DenormalMode::IEEE;
if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
return nullptr;
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -121,14 +121,10 @@
if (FtzEnabled.getNumOccurrences() > 0) {
// If nvptx-f32ftz is used on the command-line, always honor it
return FtzEnabled;
- } else {
- const Function &F = MF.getFunction();
- // Otherwise, check for an nvptx-f32ftz attribute on the function
- if (F.hasFnAttribute("nvptx-f32ftz"))
- return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
- else
- return false;
}
+
+ return MF.getDenormalMode(APFloat::IEEEsingle()) ==
+ DenormalMode::PreserveSign;
}
static bool IsPTXVectorType(MVT VT) {
Index: llvm/lib/CodeGen/MachineFunction.cpp
===================================================================
--- llvm/lib/CodeGen/MachineFunction.cpp
+++ llvm/lib/CodeGen/MachineFunction.cpp
@@ -271,6 +271,16 @@
}
DenormalMode MachineFunction::getDenormalMode(const fltSemantics &FPType) const {
+ if (&FPType == &APFloat::IEEEsingle()) {
+ Attribute Attr = F.getFnAttribute("denormal-fp-math-f32");
+ StringRef Val = Attr.getValueAsString();
+ if (!Val.empty())
+ return parseDenormalFPAttribute(Val);
+
+ // If the f32 variant of the attribute isn't specified, try to use the
+ // generic one.
+ }
+
// TODO: Should probably avoid the connection to the IR and store directly
// in the MachineFunction.
Attribute Attr = F.getFnAttribute("denormal-fp-math");
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -1818,6 +1818,30 @@
mode or that might alter the state of floating-point status flags that
might otherwise be set or cleared by calling this function. LLVM will
not introduce any new floating-point instructions that may trap.
+
+``"denormal-fp-math"``
+ This indicates the denormal (subnormal) handling that may be assumed
+ for the default floating-point environment. This may be one of
+ ``"ieee"``, ``"preserve-sign"``, or ``"positive-zero"``. If this
+ is attribute is not specified, the default is ``"ieee"``. If the
+ mode is ``"preserve-sign"``, or ``"positive-zero"``, denormal
+ outputs may be flushed to zero by standard floating point
+ operations. It is not mandated that flushing to zero occurs, but if
+ a denormal output is flushed to zero, it must respect the sign
+ mode. Not all targets support all modes. While this indicates the
+ expected floating point mode the function will be executed with,
+ this does not make any attempt to ensure the mode is
+ consistent. User or platform code is expected to set the floating
+ point mode appropriately before function entry.
+
+``"denormal-fp-math-f32"``
+ Same as ``"denormal-fp-math"``, but only controls the behavior of
+ the 32-bit float type (or vectors of 32-bit floats). If both are
+ are present, this overrides ``"denormal-fp-math"``. Not all targets
+ support separately setting the denormal mode per type, and no
+ attempt is made to diagnose unsupported uses. Currently this
+ attribute is respected by the AMDGPU and NVPTX backends.
+
``"thunk"``
This attribute indicates that the function will delegate to some other
function with a tail call. The prototype of a thunk should not be used for
Index: clang/test/Driver/opencl.cl
===================================================================
--- clang/test/Driver/opencl.cl
+++ clang/test/Driver/opencl.cl
@@ -32,7 +32,10 @@
// CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
// CHECK-MAD-ENABLE: "-cc1" {{.*}} "-cl-mad-enable"
// CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
-// CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
+
+// This is not forwarded
+// CHECK-DENORMS-ARE-ZERO-NOT: "-cl-denorms-are-zero"
+
// CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size"
// CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
Index: clang/test/Driver/denormal-fp-math.c
===================================================================
--- clang/test/Driver/denormal-fp-math.c
+++ clang/test/Driver/denormal-fp-math.c
@@ -5,7 +5,7 @@
// RUN: %clang -### -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=ieee -fno-unsafe-math-optimizations -v 2>&1 | FileCheck -check-prefix=CHECK-NO-UNSAFE %s
// RUN: not %clang -target arm-unknown-linux-gnu -c %s -fdenormal-fp-math=foo -v 2>&1 | FileCheck -check-prefix=CHECK-INVALID %s
-// CHECK-IEEE: "-fdenormal-fp-math=ieee"
+// CHECK-IEEE: -fdenormal-fp-math=ieee
// CHECK-PS: "-fdenormal-fp-math=preserve-sign"
// CHECK-PZ: "-fdenormal-fp-math=positive-zero"
// CHECK-NO-UNSAFE-NOT: "-fdenormal-fp-math=ieee"
Index: clang/test/Driver/cuda-flush-denormals-to-zero.cu
===================================================================
--- /dev/null
+++ clang/test/Driver/cuda-flush-denormals-to-zero.cu
@@ -0,0 +1,13 @@
+// Checks that cuda compilation does the right thing when passed
+// -fcuda-flush-denormals-to-zero. This should be translated to
+// -fdenormal-fp-math-f32=preserve-sign
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_20 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell--cuda-gpu-arch=sm_10 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
+
+// CPUFTZ-NOT: -fdenormal-fp-math
+
+// FTZ: "-fdenormal-fp-math-f32=preserve-sign"
+// NOFTZ: "-fdenormal-fp-math=ieee"
Index: clang/test/Driver/cl-denorms-are-zero.cl
===================================================================
--- /dev/null
+++ clang/test/Driver/cl-denorms-are-zero.cl
@@ -0,0 +1,20 @@
+// Slow FMAF and slow f32 denormals
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=pitcairn %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast FMAF, but slow f32 denormals
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast F32 denormals, but slow FMAF
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=fiji %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// Fast F32 denormals and fast FMAF
+// RUN: %clang -### -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s
+// RUN: %clang -### -cl-denorms-are-zero -o - -target amdgcn--amdhsa -c -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
+
+// AMDGCN-FLUSH: "-fdenormal-fp-math-f32=preserve-sign"
+
+// This should be omitted and default to ieee
+// AMDGCN-DENORM-NOT: "-fdenormal-fp-math-f32"
Index: clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl
===================================================================
--- clang/test/CodeGenOpenCL/gfx9-fp32-denorms.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-// REQUIRES: amdgpu-registered-target
-
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck --check-prefix=DEFAULT %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature +fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_ON %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -target-feature -fp32-denormals %s | FileCheck --check-prefix=FEATURE_FP32_DENORMALS_OFF %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - -cl-denorms-are-zero %s | FileCheck --check-prefix=OPT_DENORMS_ARE_ZERO %s
-
-// DEFAULT: +fp32-denormals
-// FEATURE_FP32_DENORMALS_ON: +fp32-denormals
-// FEATURE_FP32_DENORMALS_OFF: -fp32-denormals
-// OPT_DENORMS_ARE_ZERO: -fp32-denormals
-
-kernel void gfx9_fp32_denorms() {}
Index: clang/test/CodeGenOpenCL/denorms-are-zero.cl
===================================================================
--- clang/test/CodeGenOpenCL/denorms-are-zero.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - %s | FileCheck -check-prefix=DENORM-ZERO %s
-
-// Slow FMAF and slow f32 denormals
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast FMAF, but slow f32 denormals
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu tahiti %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast F32 denormals, but slow FMAF
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// Fast F32 denormals and fast FMAF
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-DENORM %s
-// RUN: %clang_cc1 -emit-llvm -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu gfx900 %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FLUSH-OPT %s
-
-// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu fiji %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s
-// RUN: %clang_cc1 -emit-llvm -target-feature +fp32-denormals -target-feature -fp64-fp16-denormals -cl-denorms-are-zero -o - -triple amdgcn--amdhsa -target-cpu pitcairn %s | FileCheck -check-prefixes=AMDGCN,AMDGCN-FEATURE %s
-
-
-
-// For all targets 'denorms-are-zero' attribute is set to 'true'
-// if '-cl-denorms-are-zero' was specified and to 'false' otherwise.
-
-// CHECK-LABEL: define {{(dso_local )?}}void @f()
-// CHECK: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false"
-//
-// DENORM-ZERO-LABEL: define {{(dso_local )?}}void @f()
-// DENORM-ZERO: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true"
-
-// For amdgcn target cpu fiji, fp32 should be flushed since fiji does not support fp32 denormals, unless +fp32-denormals is
-// explicitly set. amdgcn target always do not flush fp64 denormals. The control for fp64 and fp16 denormals is the same.
-
-// AMDGCN-LABEL: define void @f()
-
-// AMDGCN-FLUSH: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}"
-// AMDGCN-FLUSH-OPT: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp64-fp16-denormals,{{[^"]*}}-fp32-denormals{{[^"]*}}"
-
-// AMDGCN-DENORM: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="false" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}+fp64-fp16-denormals{{[^"]*}}"
-
-// AMDGCN-FEATURE: attributes #{{[0-9]*}} = {{{[^}]*}} "denorms-are-zero"="true" {{.*}} "target-features"="{{[^"]*}}+fp32-denormals,{{[^"]*}}-fp64-fp16-denormals{{[^"]*}}"
-void f() {}
Index: clang/test/CodeGenOpenCL/amdgpu-features.cl
===================================================================
--- clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -14,13 +14,13 @@
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s
-// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime"
-// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime"
+// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,-fp32-denormals"
+// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+fp64-fp16-denormals,+gfx8-insts,+s-memrealtime,-fp32-denormals"
// GFX700: "target-features"="+ci-insts,+flat-address-space,+fp64-fp16-denormals,-fp32-denormals"
// GFX600: "target-features"="+fp64-fp16-denormals,-fp32-denormals"
// GFX601: "target-features"="+fp64-fp16-denormals,-fp32-denormals"
Index: clang/test/CodeGenCUDA/propagate-metadata.cu
===================================================================
--- clang/test/CodeGenCUDA/propagate-metadata.cu
+++ clang/test/CodeGenCUDA/propagate-metadata.cu
@@ -15,17 +15,17 @@
// RUN: %s -o %t.bc -triple nvptx-unknown-unknown
// RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc -o - \
-// RUN: -fno-trapping-math -fcuda-is-device -triple nvptx-unknown-unknown \
+// RUN: -fno-trapping-math -fcuda-is-device -fdenormal-fp-math-f32=ieee -triple nvptx-unknown-unknown \
// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ --check-prefix=NOFAST
// RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \
-// RUN: -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \
+// RUN: -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \
// RUN: -fcuda-is-device -triple nvptx-unknown-unknown \
// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ \
// RUN: --check-prefix=NOFAST
// RUN: %clang_cc1 -x cuda %s -emit-llvm -mlink-builtin-bitcode %t.bc \
-// RUN: -fno-trapping-math -fcuda-flush-denormals-to-zero -o - \
+// RUN: -fno-trapping-math -fdenormal-fp-math-f32=preserve-sign -o - \
// RUN: -fcuda-is-device -menable-unsafe-fp-math -triple nvptx-unknown-unknown \
// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
@@ -51,13 +51,20 @@
// CHECK: define void @kernel() [[attr:#[0-9]+]]
// CHECK: define internal void @lib_fn() [[attr]]
+// FIXME: These -NOT checks do not work as intended and do not check on the same
+// line.
+
// Check the attribute list.
// CHECK: attributes [[attr]] = {
+
// CHECK-SAME: convergent
-// CHECK-SAME: "no-trapping-math"="true"
-// FTZ-SAME: "nvptx-f32ftz"="true"
-// NOFTZ-NOT: "nvptx-f32ftz"="true"
+// FTZ-NOT: "denormal-fp-math"
+
+// FTZ-SAME: "denormal-fp-math-f32"="preserve-sign"
+// NOFTZ-SAME: "denormal-fp-math-f32"="ieee"
+
+// CHECK-SAME: "no-trapping-math"="true"
// FAST-SAME: "unsafe-fp-math"="true"
// NOFAST-NOT: "unsafe-fp-math"="true"
Index: clang/test/CodeGenCUDA/flush-denormals.cu
===================================================================
--- clang/test/CodeGenCUDA/flush-denormals.cu
+++ clang/test/CodeGenCUDA/flush-denormals.cu
@@ -1,23 +1,34 @@
// RUN: %clang_cc1 -fcuda-is-device \
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
-// RUN: FileCheck %s -check-prefix CHECK -check-prefix NOFTZ
-// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \
+// RUN: FileCheck -check-prefix=DEFAULT %s
+
+// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=ieee \
+// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=NOFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
-// RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ
+// RUN: FileCheck -check-prefix=FTZ %s
+// FIXME: Unspecified should default to ieee
// RUN: %clang_cc1 -fcuda-is-device -x hip \
// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
-// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
-// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN: FileCheck -check-prefix=AMDFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -fdenormal-fp-math-f32=ieee -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=AMDNOFTZ %s
+
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fdenormal-fp-math-f32=preserve-sign \
// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
-// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+// RUN: FileCheck -check-prefix=AMDFTZ %s
#include "Inputs/cuda.h"
-// Checks that device function calls get emitted with the "ntpvx-f32ftz"
-// attribute set to "true" when we compile CUDA device code with
-// -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence
-// or absence of -fcuda-flush-denormals-to-zero in a module flag.
+// Checks that device function calls get emitted with the "denormal-fp-math-f32"
+// attribute set when we compile CUDA device code with
+// -fdenormal-fp-math-f32. Further, check that we reflect the presence or
+// absence of -fcuda-flush-denormals-to-zero in a module flag.
// AMDGCN targets always have +fp64-fp16-denormals.
// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
@@ -28,8 +39,13 @@
// CHECK-LABEL: define void @foo() #0
extern "C" __device__ void foo() {}
-// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
-// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign"
+// NOFTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="ieee"
+
+
+// FIXME: This should be removed
+// DEFAULT-NOT: "denormal-fp-math-f32"
+
// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -910,9 +910,6 @@
Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
Args.hasArg(OPT_cl_fast_relaxed_math));
Opts.Reassociate = Args.hasArg(OPT_mreassociate);
- Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
- (Args.hasArg(OPT_fcuda_is_device) &&
- Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
Opts.CorrectlyRoundedDivSqrt =
Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
Opts.UniformWGSize =
@@ -1277,6 +1274,13 @@
Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
}
+ if (Arg *A = Args.getLastArg(OPT_fdenormal_fp_math_f32_EQ)) {
+ StringRef Val = A->getValue();
+ Opts.FP32DenormalMode = llvm::parseDenormalFPAttribute(Val);
+ if (Opts.FP32DenormalMode == llvm::DenormalMode::Invalid)
+ Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
+ }
+
if (Arg *A = Args.getLastArg(OPT_fpcc_struct_return, OPT_freg_struct_return)) {
if (A->getOption().matches(OPT_fpcc_struct_return)) {
Opts.setStructReturnConvention(CodeGenOptions::SRCK_OnStack);
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -295,10 +295,6 @@
CC1Args.push_back(DriverArgs.MakeArgStringRef(GpuArch));
CC1Args.push_back("-fcuda-is-device");
- if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
- options::OPT_fno_cuda_flush_denormals_to_zero, false))
- CC1Args.push_back("-fcuda-flush-denormals-to-zero");
-
if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
Index: clang/lib/Driver/ToolChains/Cuda.h
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.h
+++ clang/lib/Driver/ToolChains/Cuda.h
@@ -149,6 +149,11 @@
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const override;
+ llvm::DenormalMode getDefaultDenormalModeForType(
+ const llvm::opt::ArgList &DriverArgs,
+ Action::OffloadKind DeviceOffloadKind,
+ const llvm::fltSemantics *FPType = nullptr) const override;
+
// Never try to use the integrated assembler with CUDA; always fork out to
// ptxas.
bool useIntegratedAs() const override { return false; }
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <system_error>
@@ -614,10 +615,6 @@
if (DeviceOffloadingKind == Action::OFK_Cuda) {
CC1Args.push_back("-fcuda-is-device");
- if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
- options::OPT_fno_cuda_flush_denormals_to_zero, false))
- CC1Args.push_back("-fcuda-flush-denormals-to-zero");
-
if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
@@ -718,6 +715,21 @@
}
}
+llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
+ const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
+ const llvm::fltSemantics *FPType) const {
+ if (DeviceOffloadKind == Action::OFK_Cuda) {
+ if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
+ DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+ options::OPT_fno_cuda_flush_denormals_to_zero,
+ false))
+ return llvm::DenormalMode::PreserveSign;
+ }
+
+ assert(DeviceOffloadKind != Action::OFK_Host);
+ return llvm::DenormalMode::IEEE;
+}
+
bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
const Option &O = A->getOption();
return (O.matches(options::OPT_gN_Group) &&
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -2480,7 +2480,8 @@
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
bool OFastEnabled, const ArgList &Args,
- ArgStringList &CmdArgs) {
+ ArgStringList &CmdArgs,
+ Action::OffloadKind DeviceOffloadKind) {
// Handle various floating point optimization flags, mapping them to the
// appropriate LLVM code generation flags. This is complicated by several
// "umbrella" flags, so we do this by stepping through the flags incrementally
@@ -2502,10 +2503,18 @@
StringRef FPModel = "";
// -ffp-exception-behavior options: strict, maytrap, ignore
StringRef FPExceptionBehavior = "";
- StringRef DenormalFPMath = "";
+ const llvm::DenormalMode DefaultDenormalFPMath =
+ TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind);
+ const llvm::DenormalMode DefaultDenormalFP32Math =
+ TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind,
+ &llvm::APFloat::IEEEsingle());
+
+ llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
+ llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
StringRef FPContract = "";
bool StrictFPModel = false;
+
if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
CmdArgs.push_back("-mlimit-float-precision");
CmdArgs.push_back(A->getValue());
@@ -2527,7 +2536,7 @@
ReciprocalMath = false;
SignedZeros = true;
// -fno_fast_math restores default denormal and fpcontract handling
- DenormalFPMath = "";
+ DenormalFPMath = DefaultDenormalFPMath;
FPContract = "";
StringRef Val = A->getValue();
if (OFastEnabled && !Val.equals("fast")) {
@@ -2621,7 +2630,19 @@
break;
case options::OPT_fdenormal_fp_math_EQ:
- DenormalFPMath = A->getValue();
+ DenormalFPMath = llvm::parseDenormalFPAttribute(A->getValue());
+ if (DenormalFPMath == llvm::DenormalMode::Invalid) {
+ D.Diag(diag::err_drv_invalid_value)
+ << A->getAsString(Args) << A->getValue();
+ }
+ break;
+
+ case options::OPT_fdenormal_fp_math_f32_EQ:
+ DenormalFP32Math = llvm::parseDenormalFPAttribute(A->getValue());
+ if (DenormalFP32Math == llvm::DenormalMode::Invalid) {
+ D.Diag(diag::err_drv_invalid_value)
+ << A->getAsString(Args) << A->getValue();
+ }
break;
// Validate and pass through -ffp-contract option.
@@ -2691,7 +2712,8 @@
TrappingMath = true;
FPExceptionBehavior = "strict";
// -fno_unsafe_math_optimizations restores default denormal handling
- DenormalFPMath = "";
+ DenormalFPMath = DefaultDenormalFPMath;
+ DenormalFP32Math = DefaultDenormalFP32Math;
break;
case options::OPT_Ofast:
@@ -2724,17 +2746,20 @@
TrappingMath = false;
RoundingFPMath = false;
// -fno_fast_math restores default denormal and fpcontract handling
- DenormalFPMath = "";
+ DenormalFPMath = DefaultDenormalFPMath;
+ DenormalFP32Math = DefaultDenormalFP32Math;
FPContract = "";
break;
}
if (StrictFPModel) {
// If -ffp-model=strict has been specified on command line but
// subsequent options conflict then emit warning diagnostic.
+ // TODO: How should this interact with DenormalFP32Math?
if (HonorINFs && HonorNaNs &&
!AssociativeMath && !ReciprocalMath &&
SignedZeros && TrappingMath && RoundingFPMath &&
- DenormalFPMath.empty() && FPContract.empty())
+ DenormalFPMath != llvm::DenormalMode::IEEE &&
+ FPContract.empty())
// OK: Current Arg doesn't conflict with -ffp-model=strict
;
else {
@@ -2780,9 +2805,16 @@
} else if (TrappingMathPresent)
CmdArgs.push_back("-fno-trapping-math");
- if (!DenormalFPMath.empty())
- CmdArgs.push_back(
- Args.MakeArgString("-fdenormal-fp-math=" + DenormalFPMath));
+ // TODO: Omit flag for the default IEEE instead
+ if (DenormalFPMath != llvm::DenormalMode::Invalid) {
+ CmdArgs.push_back(Args.MakeArgString(
+ "-fdenormal-fp-math=" + llvm::denormalModeName(DenormalFPMath)));
+ }
+
+ if (DenormalFP32Math != llvm::DenormalMode::Invalid) {
+ CmdArgs.push_back(Args.MakeArgString(
+ "-fdenormal-fp-math-f32=" + llvm::denormalModeName(DenormalFP32Math)));
+ }
if (!FPContract.empty())
CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract));
@@ -3002,6 +3034,8 @@
}
static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) {
+ // cl-denorms-are-zero is not forwarded. It is translated into a generic flag
+ // for denormal flushing handling based on the target.
const unsigned ForwardedArguments[] = {
options::OPT_cl_opt_disable,
options::OPT_cl_strict_aliasing,
@@ -3012,7 +3046,6 @@
options::OPT_cl_fast_relaxed_math,
options::OPT_cl_mad_enable,
options::OPT_cl_no_signed_zeros,
- options::OPT_cl_denorms_are_zero,
options::OPT_cl_fp32_correctly_rounded_divide_sqrt,
options::OPT_cl_uniform_work_group_size
};
@@ -4195,7 +4228,7 @@
CmdArgs.push_back("-mdisable-tail-calls");
RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args,
- CmdArgs);
+ CmdArgs, JA.getOffloadingDeviceKind());
// Render ABI arguments
switch (TC.getArch()) {
@@ -4495,7 +4528,8 @@
if (Args.hasArg(options::OPT_fsplit_stack))
CmdArgs.push_back("-split-stacks");
- RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs);
+ RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs,
+ JA.getOffloadingDeviceKind());
if (Arg *A = Args.getLastArg(options::OPT_LongDouble_Group)) {
if (TC.getTriple().isX86())
Index: clang/lib/Driver/ToolChains/AMDGPU.h
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.h
+++ clang/lib/Driver/ToolChains/AMDGPU.h
@@ -66,6 +66,11 @@
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const override;
+
+ llvm::DenormalMode getDefaultDenormalModeForType(
+ const llvm::opt::ArgList &DriverArgs,
+ Action::OffloadKind DeviceOffloadKind,
+ const llvm::fltSemantics *FPType = nullptr) const override;
};
} // end namespace toolchains
Index: clang/lib/Driver/ToolChains/AMDGPU.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -12,6 +12,7 @@
#include "clang/Driver/Compilation.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "llvm/Option/ArgList.h"
+#include "llvm/Support/TargetParser.h"
using namespace clang::driver;
using namespace clang::driver::tools;
@@ -102,6 +103,40 @@
return DAL;
}
+llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
+ const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
+ const llvm::fltSemantics *FPType) const {
+ // Denormals should always be enabled for f16 and f64.
+ if (!FPType || FPType != &llvm::APFloat::IEEEsingle())
+ return llvm::DenormalMode::IEEE;
+
+ if (DeviceOffloadKind == Action::OFK_Cuda) {
+ if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
+ DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
+ options::OPT_fno_cuda_flush_denormals_to_zero,
+ false))
+ return llvm::DenormalMode::PreserveSign;
+ }
+
+ const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
+ auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
+
+ // Default to enabling f32 denormals by default on subtargets where fma is
+ // fast with denormals
+
+ const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
+ const bool DefaultDenormsAreZeroForTarget =
+ (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
+ (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32);
+
+ // TODO: There are way too many flags that change this. Do we need to check
+ // them all?
+ bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
+ !DefaultDenormsAreZeroForTarget;
+ // Outputs are flushed to zero, preserving sign
+ return DAZ ? llvm::DenormalMode::PreserveSign : llvm::DenormalMode::IEEE;
+}
+
void AMDGPUToolChain::addClangTargetOptions(
const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -567,7 +567,8 @@
// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
// property.)
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
- CodeGenOpts.FlushDenorm ? 1 : 0);
+ CodeGenOpts.FP32DenormalMode !=
+ llvm::DenormalMode::IEEE);
}
// Emit OpenCL specific module metadata: OpenCL/SPIR version.
Index: clang/lib/CodeGen/CGCall.cpp
===================================================================
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -1747,10 +1747,17 @@
if (CodeGenOpts.NullPointerIsValid)
FuncAttrs.addAttribute("null-pointer-is-valid", "true");
+
+ // TODO: Omit attribute when the default is IEEE.
if (CodeGenOpts.FPDenormalMode != llvm::DenormalMode::Invalid)
FuncAttrs.addAttribute("denormal-fp-math",
llvm::denormalModeName(CodeGenOpts.FPDenormalMode));
+ if (CodeGenOpts.FP32DenormalMode != llvm::DenormalMode::Invalid)
+ FuncAttrs.addAttribute(
+ "denormal-fp-math-f32",
+ llvm::denormalModeName(CodeGenOpts.FP32DenormalMode));
+
FuncAttrs.addAttribute("no-trapping-math",
llvm::toStringRef(CodeGenOpts.NoTrappingMath));
@@ -1777,10 +1784,6 @@
"correctly-rounded-divide-sqrt-fp-math",
llvm::toStringRef(CodeGenOpts.CorrectlyRoundedDivSqrt));
- if (getLangOpts().OpenCL)
- FuncAttrs.addAttribute("denorms-are-zero",
- llvm::toStringRef(CodeGenOpts.FlushDenorm));
-
// TODO: Reciprocal estimate codegen options should apply to instructions?
const std::vector<std::string> &Recips = CodeGenOpts.Reciprocals;
if (!Recips.empty())
@@ -1813,10 +1816,6 @@
if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
// Exceptions aren't supported in CUDA device code.
FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
-
- // Respect -fcuda-flush-denormals-to-zero.
- if (CodeGenOpts.FlushDenorm)
- FuncAttrs.addAttribute("nvptx-f32ftz", "true");
}
for (StringRef Attr : CodeGenOpts.DefaultFunctionAttrs) {
Index: clang/lib/Basic/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/Basic/Targets/AMDGPU.cpp
+++ clang/lib/Basic/Targets/AMDGPU.cpp
@@ -241,7 +241,8 @@
}
if (!hasFP32Denormals)
TargetOpts.Features.push_back(
- (Twine(hasFastFMAF() && hasFullRateDenormalsF32() && !CGOpts.FlushDenorm
+ (Twine(hasFastFMAF() && hasFullRateDenormalsF32() &&
+ CGOpts.FP32DenormalMode == llvm::DenormalMode::IEEE
? '+' : '-') + Twine("fp32-denormals"))
.str());
// Always do not flush fp64 or fp16 denorms.
Index: clang/include/clang/Driver/ToolChain.h
===================================================================
--- clang/include/clang/Driver/ToolChain.h
+++ clang/include/clang/Driver/ToolChain.h
@@ -16,7 +16,9 @@
#include "clang/Driver/Action.h"
#include "clang/Driver/Multilib.h"
#include "clang/Driver/Types.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -606,6 +608,17 @@
/// Returns true when it's possible to split LTO unit to use whole
/// program devirtualization and CFI santiizers.
virtual bool canSplitThinLTOUnit() const { return true; }
+
+ /// Returns the output denormal handling type in the default floating point
+ /// environment for the given \p FPType if given. Otherwise, the default
+ /// assumed mode for any floating point type.
+ virtual llvm::DenormalMode getDefaultDenormalModeForType(
+ const llvm::opt::ArgList &DriverArgs,
+ Action::OffloadKind DeviceOffloadKind,
+ const llvm::fltSemantics *FPType = nullptr) const {
+ // FIXME: This should be IEEE when default handling is fixed.
+ return llvm::DenormalMode::Invalid;
+ }
};
/// Set a ToolChain's effective triple. Reset it when the registration object
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -523,7 +523,7 @@
HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,clc++,CLC++">;
-def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
+def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>,
HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
@@ -581,7 +581,7 @@
def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
HelpText<"Path to ptxas (used for compiling CUDA code)">;
def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
- Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
+ HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
Index: clang/include/clang/Driver/CC1Options.td
===================================================================
--- clang/include/clang/Driver/CC1Options.td
+++ clang/include/clang/Driver/CC1Options.td
@@ -405,6 +405,9 @@
def cfguard : Flag<["-"], "cfguard">,
HelpText<"Emit Windows Control Flow Guard tables and checks">;
+def fdenormal_fp_math_f32_EQ : Joined<["-"], "fdenormal-fp-math-f32=">,
+ Group<f_Group>;
+
//===----------------------------------------------------------------------===//
// Dependency Output Options
//===----------------------------------------------------------------------===//
Index: clang/include/clang/Basic/CodeGenOptions.h
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.h
+++ clang/include/clang/Basic/CodeGenOptions.h
@@ -166,6 +166,9 @@
/// The floating-point denormal mode to use.
llvm::DenormalMode FPDenormalMode = llvm::DenormalMode::Invalid;
+ /// The floating-point subnormal mode to use, for float.
+ llvm::DenormalMode FP32DenormalMode = llvm::DenormalMode::Invalid;
+
/// The float precision limit to use, if non-empty.
std::string LimitFloatPrecision;
Index: clang/include/clang/Basic/CodeGenOptions.def
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.def
+++ clang/include/clang/Basic/CodeGenOptions.def
@@ -153,7 +153,6 @@
CODEGENOPT(ReciprocalMath , 1, 0) ///< Allow FP divisions to be reassociated.
CODEGENOPT(NoTrappingMath , 1, 0) ///< Set when -fno-trapping-math is enabled.
CODEGENOPT(NoNaNsFPMath , 1, 0) ///< Assume FP arguments, results not NaN.
-CODEGENOPT(FlushDenorm , 1, 0) ///< Allow FP denorm numbers to be flushed to zero
CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
/// When false, this attempts to generate code as if the result of an
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits