arsenm updated this revision to Diff 537737.
arsenm added a comment.

Split div/sqrt handling since they have different values. Also cuda does have 
unimplemented flags to control these individually. Not sure it's worth trying 
to merge them into one function


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D154495/new/

https://reviews.llvm.org/D154495

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGExpr.cpp
  clang/lib/CodeGen/CGExprScalar.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/CodeGenCUDA/correctly-rounded-div.cu
  clang/test/CodeGenOpenCL/fpmath.cl

Index: clang/test/CodeGenOpenCL/fpmath.cl
===================================================================
--- clang/test/CodeGenOpenCL/fpmath.cl
+++ clang/test/CodeGenOpenCL/fpmath.cl
@@ -8,7 +8,7 @@
 float spscalardiv(float a, float b) {
   // CHECK: @spscalardiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD:[0-9]+]]
+  // NODIVOPT: !fpmath ![[MD_FDIV:[0-9]+]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
@@ -16,11 +16,18 @@
 float4 spvectordiv(float4 a, float4 b) {
   // CHECK: @spvectordiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD]]
+  // NODIVOPT: !fpmath ![[MD_FDIV]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
 
+float spscalarsqrt(float a) {
+  // CHECK-LABEL: @spscalarsqrt
+  // NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+  return __builtin_sqrtf(a);
+}
+
 #if __OPENCL_C_VERSION__ >=120
 void printf(constant char* fmt, ...);
 
@@ -34,11 +41,27 @@
 
 #ifndef NOFP64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+typedef __attribute__(( ext_vector_type(4) )) double double4;
+
 double dpscalardiv(double a, double b) {
   // CHECK: @dpscalardiv
   // CHECK-NOT: !fpmath
   return a / b;
 }
+
+double4 dpvectordiv(double4 a, double4 b) {
+  // CHECK: @dpvectordiv
+  // CHECK-NOT: !fpmath
+  return a / b;
+}
+
+double dpscalarsqrt(double a) {
+  // CHECK-LABEL: @dpscalarsqrt
+  // CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
+  return __builtin_sqrt(a);
+}
+
 #endif
 
-// NODIVOPT: ![[MD]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_SQRT]] = !{float 3.000000e+00}
Index: clang/test/CodeGenCUDA/correctly-rounded-div.cu
===================================================================
--- clang/test/CodeGenCUDA/correctly-rounded-div.cu
+++ clang/test/CodeGenCUDA/correctly-rounded-div.cu
@@ -32,4 +32,18 @@
   return a / b;
 }
 
-// NCRDIV: ![[MD]] = !{float 2.500000e+00}
+// COMMON-LABEL: @_Z12spscalarsqrt
+// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
+// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
+__device__ float spscalarsqrt(float a) {
+  return __builtin_sqrtf(a);
+}
+
+// COMMON-LABEL: @_Z12dpscalarsqrt
+// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
+// COMMON-NOT: !fpmath
+__device__ double dpscalarsqrt(double a) {
+  return __builtin_sqrt(a);
+}
+
+// NCRSQRT: ![[MD]] = !{float 2.500000e+00}
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -4704,6 +4704,14 @@
   /// point operation, expressed as the maximum relative error in ulp.
   void SetFPAccuracy(llvm::Value *Val, float Accuracy);
 
+  /// Set the minimum required accuracy of the given sqrt operation
+  /// based on CodeGenOpts.
+  void SetSqrtFPAccuracy(llvm::Value *Val);
+
+  /// Set the minimum required accuracy of the given sqrt operation based on
+  /// CodeGenOpts.
+  void SetDivFPAccuracy(llvm::Value *Val);
+
   /// Set the codegen fast-math flags.
   void SetFastMathFlags(FPOptions FPFeatures);
 
Index: clang/lib/CodeGen/CGExprScalar.cpp
===================================================================
--- clang/lib/CodeGen/CGExprScalar.cpp
+++ clang/lib/CodeGen/CGExprScalar.cpp
@@ -3478,21 +3478,7 @@
     llvm::Value *Val;
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
     Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
-    if ((CGF.getLangOpts().OpenCL &&
-         !CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
-        (CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice &&
-         !CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
-      // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
-      // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
-      // build option allows an application to specify that single precision
-      // floating-point divide (x/y and 1/x) and sqrt used in the program
-      // source are correctly rounded.
-      llvm::Type *ValTy = Val->getType();
-      if (ValTy->isFloatTy() ||
-          (isa<llvm::VectorType>(ValTy) &&
-           cast<llvm::VectorType>(ValTy)->getElementType()->isFloatTy()))
-        CGF.SetFPAccuracy(Val, 2.5);
-    }
+    CGF.SetDivFPAccuracy(Val);
     return Val;
   }
   else if (Ops.isFixedPointOp())
Index: clang/lib/CodeGen/CGExpr.cpp
===================================================================
--- clang/lib/CodeGen/CGExpr.cpp
+++ clang/lib/CodeGen/CGExpr.cpp
@@ -5577,6 +5577,48 @@
   cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);
 }
 
+void CodeGenFunction::SetSqrtFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 3ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-sqrt flag
+    SetFPAccuracy(Val, 3.0f);
+  }
+}
+
+void CodeGenFunction::SetDivFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-div flag
+    SetFPAccuracy(Val, 2.5f);
+  }
+}
+
 namespace {
   struct LValueOrRValue {
     LValue LV;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2532,11 +2532,12 @@
     case Builtin::BI__builtin_sqrtf:
     case Builtin::BI__builtin_sqrtf16:
     case Builtin::BI__builtin_sqrtl:
-    case Builtin::BI__builtin_sqrtf128:
-      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
-                                   Intrinsic::sqrt,
-                                   Intrinsic::experimental_constrained_sqrt));
-
+    case Builtin::BI__builtin_sqrtf128: {
+      llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
+      SetSqrtFPAccuracy(Call);
+      return RValue::get(Call);
+    }
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to