vikramRH created this revision.
vikramRH added reviewers: sameerds, b-sumner, yaxunl.
vikramRH added a project: LLVM.
Herald added subscribers: kosarev, foad, kerbowa, hiraditya, Anastasia, 
jvesely, arsenm.
Herald added a project: All.
vikramRH requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, MaskRay.
Herald added a project: clang.

This patch essentially ports the existing non-hostcall printf support in OpenCL
to HIP (controlled via the new option "-fdelayed-printf"), with the following
changes:

1. Code refactoring: we now use the "getConstantStringInfo()" API to extract
constant string contents at compile time.
2. Support for printing non-const null-terminated strings, which is required
in the HIP context. This is achieved as follows:
  - Calculate the string size using the function "getStrlenWithNull()" and
reserve the space in the printf buffer using __printf_alloc() (as was the case
in OpenCL, except that the number of bytes allocated can now be dynamic).
  - Copy the string contents to the buffer using the previously calculated size
and the pointer to the string (a memcpy intrinsic is generated here).


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D138702

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/Basic/Builtins.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
  llvm/test/CodeGen/AMDGPU/hip-delayed-printf.ll
  llvm/test/CodeGen/AMDGPU/opencl-printf.ll

Index: llvm/test/CodeGen/AMDGPU/opencl-printf.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/opencl-printf.ll
+++ llvm/test/CodeGen/AMDGPU/opencl-printf.ll
@@ -9,18 +9,17 @@
 ; R600: call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n)
 ; GCN-LABEL: entry
 ; GCN: call i8 addrspace(1)* @__printf_alloc
-; GCN-LABEL: entry.split
+; GCN-LABEL: strlen.join.split
 ; GCN: icmp ne i8 addrspace(1)* %printf_alloc_fn, null
+; GCN: br i1 %14, label %15, label %16
 ; GCN: %PrintBuffID = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 0
 ; GCN: %PrintBuffIdCast = bitcast i8 addrspace(1)* %PrintBuffID to i32 addrspace(1)*
-; GCN: store i32 1, i32 addrspace(1)* %PrintBuffIdCast
+; GCN: store i32 1, i32 addrspace(1)* %PrintBuffIdCast, align 4
 ; GCN: %PrintBuffGep = getelementptr i8, i8 addrspace(1)* %printf_alloc_fn, i32 4
-; GCN: %PrintArgPtr = ptrtoint i8* %arraydecay to i64
-; GCN: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffGep to i64 addrspace(1)*
-; GCN: store i64 %PrintArgPtr, i64 addrspace(1)* %PrintBuffPtrCast
-; GCN: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i32 8
-; GCN: %PrintBuffPtrCast1 = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)*
-; GCN: store i32 %n, i32 addrspace(1)* %PrintBuffPtrCast1
+; GCN: call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %PrintBuffGep, i8* %arraydecay, i64 %9, i1 false)
+; GCN: %PrintBuffNextPtr = getelementptr i8, i8 addrspace(1)* %PrintBuffGep, i64 %11
+; GCN: %PrintBuffPtrCast = bitcast i8 addrspace(1)* %PrintBuffNextPtr to i32 addrspace(1)*
+; GCN: store i32 %n, i32 addrspace(1)* %PrintBuffPtrCast, align 4
 
 @.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1
 
Index: llvm/test/CodeGen/AMDGPU/hip-delayed-printf.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/hip-delayed-printf.ll
@@ -0,0 +1,57 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-printf-runtime-binding -S < %s | FileCheck --check-prefix=FUNC --check-prefix=GCN --check-prefix=METADATA %s
+
+; FUNC-LABEL: @test_kernel(
+; GCN-LABEL: entry
+; GCN-LABEL: strlen.while
+; GCN: br i1 %6, label %strlen.while.done, label %strlen.while
+; GCN-LABEL: strlen.join
+; GCN: %12 = add i64 %11, 3
+; GCN: %13 = and i64 %12, 4294967292
+; GCN: %14 = add i64 %13, 4
+; GCN: %15 = trunc i64 %14 to i32
+; GCN: %printf_alloc_fn = call ptr addrspace(1) @__printf_alloc(i32 %15)
+; GCN-LABEL: strlen.join.split
+; GCN: %16 = icmp ne ptr addrspace(1) %printf_alloc_fn, null
+; GCN: br i1 %16, label %17, label %18
+; GCN: %PrintBuffID = getelementptr i8, ptr addrspace(1) %printf_alloc_fn, i32 0
+; GCN: %PrintBuffIdCast = bitcast ptr addrspace(1) %PrintBuffID to ptr addrspace(1)
+; GCN: store i32 1, ptr addrspace(1) %PrintBuffIdCast, align 4
+; GCN: %PrintBuffGep = getelementptr i8, ptr addrspace(1) %printf_alloc_fn, i32 4
+; GCN: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) %PrintBuffGep, ptr %1, i64 %11, i1 false)
+; GCN: %PrintBuffNextPtr = getelementptr i8, ptr addrspace(1) %PrintBuffGep, i64 %13
+; GCN: br label %18
+
+; METADATA: !llvm.printf.fmts = !{!0}
+; METADATA: !0 = !{!"1:1:8:%s"}
+
+@.str = private unnamed_addr addrspace(4) constant [3 x i8] c"%s\00", align 1
+@.str.1 = private unnamed_addr addrspace(4) constant [6 x i8] c"hello\00", align 1
+@.str.2 = private unnamed_addr addrspace(4) constant [6 x i8] c"world\00", align 1
+
+
+define amdgpu_kernel void @test_kernel() {
+entry:
+  %q = alloca ptr, align 8, addrspace(5)
+  %p = alloca i32, align 4, addrspace(5)
+  %q.ascast = addrspacecast ptr addrspace(5) %q to ptr
+  %p.ascast = addrspacecast ptr addrspace(5) %p to ptr
+  store i32 25, ptr %p.ascast, align 4
+  %0 = load i32, ptr %p.ascast, align 4
+  %cmp = icmp sgt i32 %0, 30
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr %q.ascast, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  store ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), ptr %q.ascast, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %1 = load ptr, ptr %q.ascast, align 8
+  %call = call i32 (ptr, ...) @printf(ptr noundef addrspacecast (ptr addrspace(4) @.str to ptr), ptr noundef %1)
+  ret void
+}
+
+declare i32 @printf(ptr, ...)
\ No newline at end of file
Index: llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -62,7 +63,7 @@
   void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers,
                                StringRef fmt, size_t num_ops) const;
 
-  bool shouldPrintAsStr(char Specifier, Type *OpType) const;
+  bool shouldPrintAsStr(char Specifier) const { return (Specifier == 's'); }
   bool lowerPrintfForGpu(Module &M);
 
   Value *simplify(Instruction *I, const TargetLibraryInfo *TLI,
@@ -131,79 +132,111 @@
   }
 }
 
-bool AMDGPUPrintfRuntimeBindingImpl::shouldPrintAsStr(char Specifier,
-                                                      Type *OpType) const {
-  if (Specifier != 's')
-    return false;
-  const PointerType *PT = dyn_cast<PointerType>(OpType);
-  if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
-    return false;
-  Type *ElemType = PT->getContainedType(0);
-  if (ElemType->getTypeID() != Type::IntegerTyID)
-    return false;
-  IntegerType *ElemIType = cast<IntegerType>(ElemType);
-  return ElemIType->getBitWidth() == 8;
+// This function is essentially a copy from the file
+// Transforms/Utils/AMDGPUEmitPrintf.cpp
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
+  auto *Prev = Builder.GetInsertBlock();
+  Module *M = Prev->getModule();
+
+  auto CharZero = Builder.getInt8(0);
+  auto One = Builder.getInt64(1);
+  auto Zero = Builder.getInt64(0);
+  auto Int64Ty = Builder.getInt64Ty();
+
+  // The length is either zero for a null pointer, or the computed value for an
+  // actual string. We need a join block for a phi that represents the final
+  // value.
+  BasicBlock *Join = nullptr;
+  if (Prev->getTerminator()) {
+    Join = Prev->splitBasicBlock(Builder.GetInsertPoint(), "strlen.join");
+    Prev->getTerminator()->eraseFromParent();
+  } else {
+    Join =
+        BasicBlock::Create(M->getContext(), "strlen.join", Prev->getParent());
+  }
+  BasicBlock *While = BasicBlock::Create(M->getContext(), "strlen.while",
+                                         Prev->getParent(), Join);
+  BasicBlock *WhileDone = BasicBlock::Create(
+      M->getContext(), "strlen.while.done", Prev->getParent(), Join);
+
+  // Emit an early return for when the pointer is null.
+  Builder.SetInsertPoint(Prev);
+  auto CmpNull =
+      Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
+  BranchInst::Create(Join, While, CmpNull, Prev);
+
+  // Entry to the while loop.
+  Builder.SetInsertPoint(While);
+
+  auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
+  PtrPhi->addIncoming(Str, Prev);
+  auto PtrNext = Builder.CreateGEP(Builder.getInt8Ty(), PtrPhi, One);
+  PtrPhi->addIncoming(PtrNext, While);
+
+  // Condition for the while loop.
+  auto Data = Builder.CreateLoad(Builder.getInt8Ty(), PtrPhi);
+  auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
+  Builder.CreateCondBr(Cmp, WhileDone, While);
+
+  // Add one to the computed length.
+  Builder.SetInsertPoint(WhileDone, WhileDone->begin());
+  auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
+  auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
+  auto Len = Builder.CreateSub(End, Begin);
+  Len = Builder.CreateAdd(Len, One);
+
+  // Final join.
+  BranchInst::Create(Join, WhileDone);
+  Builder.SetInsertPoint(Join, Join->begin());
+  auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
+  LenPhi->addIncoming(Len, WhileDone);
+  LenPhi->addIncoming(Zero, Prev);
+
+  return LenPhi;
 }
 
 bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
   LLVMContext &Ctx = M.getContext();
   IRBuilder<> Builder(Ctx);
-  Type *I32Ty = Type::getInt32Ty(Ctx);
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  Type *Int64Ty = Type::getInt64Ty(Ctx);
   unsigned UniqID = 0;
-  // NB: This is important for this string size to be divisible by 4
-  const char NonLiteralStr[4] = "???";
 
   for (auto *CI : Printfs) {
     unsigned NumOps = CI->arg_size();
 
+    StringRef Str;
     SmallString<16> OpConvSpecifiers;
     Value *Op = CI->getArgOperand(0);
 
-    if (auto LI = dyn_cast<LoadInst>(Op)) {
-      Op = LI->getPointerOperand();
-      for (auto *Use : Op->users()) {
-        if (auto SI = dyn_cast<StoreInst>(Use)) {
-          Op = SI->getValueOperand();
-          break;
-        }
-      }
-    }
-
-    if (auto I = dyn_cast<Instruction>(Op)) {
-      Value *Op_simplified =
-          simplify(I, &GetTLI(*I->getFunction()), &GetDT(*I->getFunction()));
-      if (Op_simplified)
-        Op = Op_simplified;
-    }
+    // helper struct to package the string related data
+    typedef struct S {
+      StringRef Str;
+      llvm::Value *RealSize;
+      llvm::Value *AlignedSize;
 
-    ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Op);
+      S(StringRef str, llvm::Value *RS, llvm::Value *AS)
+          : Str(str), RealSize(RS), AlignedSize(AS) {}
+    } StringData;
 
-    if (ConstExpr) {
-      GlobalVariable *GVar = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+    if (getConstantStringInfo(Op, Str) && !Str.empty()) {
+      // we need this call to ascertain
+      // that we are printing a string
+      // or a pointer. It takes out the
+      // specifiers and fills up the first
+      // arg
+      getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1);
 
-      StringRef Str("unknown");
-      if (GVar && GVar->hasInitializer()) {
-        auto *Init = GVar->getInitializer();
-        if (auto *CA = dyn_cast<ConstantDataArray>(Init)) {
-          if (CA->isString())
-            Str = CA->getAsCString();
-        } else if (isa<ConstantAggregateZero>(Init)) {
-          Str = "";
-        }
-        //
-        // we need this call to ascertain
-        // that we are printing a string
-        // or a pointer. It takes out the
-        // specifiers and fills up the first
-        // arg
-        getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1);
-      }
       // Add metadata for the string
       std::string AStreamHolder;
       raw_string_ostream Sizes(AStreamHolder);
       int Sum = DWORD_ALIGN;
       Sizes << CI->arg_size() - 1;
       Sizes << ':';
+      uint32_t NonConstStringCnt = 0;
+      Value *NonConstStringValue = nullptr;
+      FunctionCallee strlenFunc, AlignFunc;
+      SmallVector<StringData, 8> StringContents;
       for (unsigned ArgCount = 1;
            ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size();
            ArgCount++) {
@@ -216,11 +249,11 @@
         // expand the arguments that do not follow this rule.
         //
         if (ArgSize % DWORD_ALIGN != 0) {
-          llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx);
+          llvm::Type *ResType = Int32Ty;
           auto *LLVMVecType = llvm::dyn_cast<llvm::FixedVectorType>(ArgType);
           int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1;
           if (LLVMVecType && NumElem > 1)
-            ResType = llvm::FixedVectorType::get(ResType, NumElem);
+            ResType = llvm::FixedVectorType::get(Int32Ty, NumElem);
           Builder.SetInsertPoint(CI);
           Builder.SetCurrentDebugLocation(CI->getDebugLoc());
           if (OpConvSpecifiers[ArgCount - 1] == 'x' ||
@@ -246,34 +279,56 @@
               ArgSize = 4;
           }
         }
-        if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
-          if (auto *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
-            auto *GV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
-            if (GV && GV->hasInitializer()) {
-              Constant *Init = GV->getInitializer();
-              bool IsZeroValue = Init->isZeroValue();
-              auto *CA = dyn_cast<ConstantDataArray>(Init);
-              if (IsZeroValue || (CA && CA->isString())) {
-                size_t SizeStr =
-                    IsZeroValue ? 1 : (strlen(CA->getAsCString().data()) + 1);
-                size_t Rem = SizeStr % DWORD_ALIGN;
-                size_t NSizeStr = 0;
-                LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr
-                                  << '\n');
-                if (Rem) {
-                  NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
-                } else {
-                  NSizeStr = SizeStr;
-                }
-                ArgSize = NSizeStr;
-              }
+        if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1])) {
+          StringRef S;
+          if (getConstantStringInfo(Arg, S, /*TrimAtNul*/ false)) {
+            size_t SizeStr = S.size();
+            size_t Rem = SizeStr % DWORD_ALIGN;
+            LLVM_DEBUG(dbgs()
+                       << "Printf string original size = " << SizeStr << '\n');
+            size_t NSizeStr;
+            if (Rem) {
+              NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
             } else {
-              ArgSize = sizeof(NonLiteralStr);
+              NSizeStr = SizeStr;
             }
+            ArgSize = NSizeStr;
+
+            auto AlignedSize = ConstantInt::get(Int32Ty, ArgSize, false);
+            StringContents.push_back(StringData(S, AlignedSize, AlignedSize));
+            Sum += ArgSize;
           } else {
-            ArgSize = sizeof(NonLiteralStr);
+            Builder.SetInsertPoint(CI);
+            auto strlen = getStrlenWithNull(Builder, Arg);
+
+            // Align the computed length to next 4 byte boundary
+            auto Temp = Builder.CreateAdd(
+                strlen, ConstantInt::get(strlen->getType(), 3U));
+            auto alignedLen = Builder.CreateAnd(
+                Temp, ConstantInt::get(Type::getInt64Ty(Ctx), ~3U));
+
+            if (NonConstStringCnt) {
+              Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+              auto Val = Builder.CreateAdd(alignedLen, NonConstStringValue,
+                                           "cumulativeAdd");
+              NonConstStringValue = Val;
+            } else
+              NonConstStringValue = alignedLen;
+
+            // actual string not known here, hence keep the field empty.
+            StringContents.push_back(StringData("", strlen, alignedLen));
+            NonConstStringCnt++;
           }
+
+          // The non const string sizes in metadata are always updated
+          // to be size of the pointer(8 bytes).
+          // The runtime handles the calculation of actual string sizes.
+          Sizes << 8 << ':';
+
+          // string argument handled, continue with next argument
+          continue;
         }
+
         LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize
                           << " for type: " << *ArgType << '\n');
         Sizes << ArgSize << ':';
@@ -281,6 +336,7 @@
       }
       LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str()
                         << '\n');
+
       for (char C : Str) {
         // Rest of the C escape sequences (e.g. \') are handled correctly
         // by the MDParser
@@ -321,9 +377,7 @@
       AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex,
                                               Attribute::NoUnwind);
 
-      Type *SizetTy = Type::getInt32Ty(Ctx);
-
-      Type *Tys_alloc[1] = {SizetTy};
+      Type *Tys_alloc[1] = {Int32Ty};
       Type *I8Ty = Type::getInt8Ty(Ctx);
       Type *I8Ptr = PointerType::get(I8Ty, 1);
       FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
@@ -345,8 +399,14 @@
       NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd);
       MDNode *myMD = MDNode::get(Ctx, fmtStrArray);
       metaD->addOperand(myMD);
-      Value *sumC = ConstantInt::get(SizetTy, Sum, false);
+      Value *sumC = ConstantInt::get(Type::getInt64Ty(Ctx), Sum, false);
       SmallVector<Value *, 1> alloc_args;
+      if (NonConstStringValue)
+        sumC = Builder.CreateAdd(NonConstStringValue, sumC);
+
+      // Truncate the string size to 32 bits,
+      sumC = Builder.CreateTrunc(sumC, Int32Ty);
+
       alloc_args.push_back(sumC);
       CallInst *pcall =
           CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI);
@@ -361,7 +421,7 @@
       auto *cmp = cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
       if (!CI->use_empty()) {
         Value *result =
-            Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res");
+            Builder.CreateSExt(Builder.CreateNot(cmp), Int32Ty, "printf_res");
         CI->replaceAllUsesWith(result);
       }
       SplitBlock(CI->getParent(), cmp);
@@ -376,11 +436,11 @@
           I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID",
           Brnch);
 
-      Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
+      Type *idPointer = PointerType::get(Int32Ty, AMDGPUAS::GLOBAL_ADDRESS);
       Value *id_gep_cast =
           new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
 
-      new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
+      new StoreInst(ConstantInt::get(Int32Ty, UniqID), id_gep_cast, Brnch);
 
       // 1st 4 bytes hold the printf_id
       // the following GEP is the buffer pointer
@@ -388,8 +448,7 @@
           I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 4)), "PrintBuffGep",
           Brnch);
 
-      Type *Int32Ty = Type::getInt32Ty(Ctx);
-      Type *Int64Ty = Type::getInt64Ty(Ctx);
+      int curStringIdx = 0;
       for (unsigned ArgCount = 1;
            ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size();
            ArgCount++) {
@@ -417,44 +476,64 @@
           Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch);
           WhatToStore.push_back(Arg);
         } else if (ArgType->getTypeID() == Type::PointerTyID) {
-          if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
-            const char *S = NonLiteralStr;
-            if (auto *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
-              auto *GV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
-              if (GV && GV->hasInitializer()) {
-                Constant *Init = GV->getInitializer();
-                bool IsZeroValue = Init->isZeroValue();
-                auto *CA = dyn_cast<ConstantDataArray>(Init);
-                if (IsZeroValue || (CA && CA->isString())) {
-                  S = IsZeroValue ? "" : CA->getAsCString().data();
+          if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1])) {
+            // get the string "Arg" points to
+            if (auto ConstVal = dyn_cast<ConstantInt>(
+                    StringContents[curStringIdx].RealSize)) {
+              // I guess its safe to use ConstVal directly without guards here
+              // since we know the string contents already,
+              auto NSizeStr = ConstVal->getZExtValue();
+              StringRef S = StringContents[curStringIdx].Str;
+
+              // since we know string contents, push them to printf buffer
+              // as 4 byte chunks rather than using memcpy.
+              if (S[0]) {
+                char *MyNewStr = new char[NSizeStr]();
+                strcpy(MyNewStr, S.str().c_str());
+                int NumInts = NSizeStr / 4;
+                int CharC = 0;
+                while (NumInts) {
+                  int ANum = *(int *)(MyNewStr + CharC);
+                  CharC += 4;
+                  NumInts--;
+                  Value *ANumV = ConstantInt::get(Int32Ty, ANum, false);
+                  WhatToStore.push_back(ANumV);
                 }
-              }
-            }
-            size_t SizeStr = strlen(S) + 1;
-            size_t Rem = SizeStr % DWORD_ALIGN;
-            size_t NSizeStr = 0;
-            if (Rem) {
-              NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
-            } else {
-              NSizeStr = SizeStr;
-            }
-            if (S[0]) {
-              char *MyNewStr = new char[NSizeStr]();
-              strcpy(MyNewStr, S);
-              int NumInts = NSizeStr / 4;
-              int CharC = 0;
-              while (NumInts) {
-                int ANum = *(int *)(MyNewStr + CharC);
-                CharC += 4;
-                NumInts--;
-                Value *ANumV = ConstantInt::get(Int32Ty, ANum, false);
+                delete[] MyNewStr;
+              } else {
+                // Empty string, give a hint to RT it is no NULL
+                Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false);
                 WhatToStore.push_back(ANumV);
               }
-              delete[] MyNewStr;
+              curStringIdx++;
             } else {
-              // Empty string, give a hint to RT it is no NULL
-              Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false);
-              WhatToStore.push_back(ANumV);
+              auto val = StringContents[curStringIdx].RealSize;
+              Type *Tys[] = {BufferIdx->getType(), Arg->getType(),
+                             val->getType()};
+              Function *TheFn =
+                  Intrinsic::getDeclaration(&M, Intrinsic::memcpy, Tys);
+              SmallVector<Value *, 1> BuffOffset;
+
+              Value *Args[] = {BufferIdx, Arg, val,
+                               ConstantInt::get(Type::getInt1Ty(Ctx), false)};
+
+              // This copies the contents of the string, however the next offset
+              // is at aligned length, the extra space that might be created due
+              // to alignment padding is not populated with any specific value
+              // here, I feel this would be safe as long as runtime is sync with
+              // the offsets.
+              CallInst::Create(TheFn->getFunctionType(), TheFn, Args,
+                               llvm::None, "", Brnch);
+
+              Builder.SetInsertPoint(Brnch);
+              BuffOffset.push_back(StringContents[curStringIdx].AlignedSize);
+              BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset,
+                                                    "PrintBuffNextPtr", Brnch);
+              LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
+                                << *BufferIdx << '\n');
+              curStringIdx++;
+              // done with current argument, move to next
+              continue;
             }
           } else {
             uint64_t Size = TD->getTypeAllocSizeInBits(ArgType);
@@ -516,7 +595,7 @@
           unsigned ArgSize =
               TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8;
           SmallVector<Value *, 1> BuffOffset;
-          BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize));
+          BuffOffset.push_back(ConstantInt::get(Int32Ty, ArgSize));
 
           Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1);
           Value *CastedGEP =
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4652,8 +4652,19 @@
     }
     CmdArgs.push_back("-aux-triple");
     CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
+
+    if (JA.isDeviceOffloading(Action::OFK_HIP)) {
+      // Device side compilation printf
+      if (Args.getLastArg(options::OPT_fdelayed_printf))
+        CmdArgs.push_back("-fdelayed-printf");
+    }
   }
 
+  // unconditionally claim the printf option now to avoid unused diagnostic
+  // TODO: OpenCL targets need to use this option too
+  if (const Arg *PF = Args.getLastArg(options::OPT_fdelayed_printf))
+    PF->claim();
+
   if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) {
     CmdArgs.push_back("-fsycl-is-device");
 
Index: clang/lib/Basic/Builtins.cpp
===================================================================
--- clang/lib/Basic/Builtins.cpp
+++ clang/lib/Basic/Builtins.cpp
@@ -89,11 +89,15 @@
   bool CUDAUnsupported = !LangOpts.CUDA && BuiltinInfo.Langs == CUDA_LANG;
   bool CPlusPlusUnsupported =
       !LangOpts.CPlusPlus && BuiltinInfo.Langs == CXX_LANG;
+  // dependency of printf on "-fdelayed-printf" option
+  bool PrintfUnsupported = LangOpts.HIP &&
+                           llvm::StringRef(BuiltinInfo.Name).equals("printf") &&
+                           LangOpts.DelayedPrintf;
   return !BuiltinsUnsupported && !CorBuiltinsUnsupported &&
          !MathBuiltinsUnsupported && !OclCUnsupported && !OclGASUnsupported &&
          !OclPipeUnsupported && !OclDSEUnsupported && !OpenMPUnsupported &&
          !GnuModeUnsupported && !MSModeUnsupported && !ObjCUnsupported &&
-         !CPlusPlusUnsupported && !CUDAUnsupported;
+         !CPlusPlusUnsupported && !CUDAUnsupported && !PrintfUnsupported;
 }
 
 /// initializeBuiltins - Mark the identifiers for all the builtins with their
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -981,6 +981,10 @@
   TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
   NegFlag<SetFalse>>;
+def fdelayed_printf : Flag<["-"], "fdelayed-printf">,
+  HelpText<"Specifies which version of printf is to be used while CodeGen">,
+  Flags<[CC1Option]>,
+  MarshallingInfoFlag<LangOpts<"DelayedPrintf">>;
 def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
   HelpText<"Specify default stream. The default value is 'legacy'. (HIP only)">,
   Flags<[CC1Option]>,
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -272,6 +272,7 @@
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(DelayedPrintf, 1, 0, "version onf printf function to be used, hostcall or buffer based")
 
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 LANGOPT(SYCLIsHost        , 1, 0, "SYCL host compilation")
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to