Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
This revision was automatically updated to reflect the committed changes. Closed by commit rL247317: [CUDA] Postprocess bitcode linked in during device-side CUDA compilation. (authored by tra). Changed prior to commit: http://reviews.llvm.org/D11664?vs=34467&id=34470#toc Repository: rL LLVM http://reviews.llvm.org/D11664 Files: cfe/trunk/include/clang/Basic/LangOptions.def cfe/trunk/include/clang/Driver/CC1Options.td cfe/trunk/lib/CodeGen/CodeGenAction.cpp cfe/trunk/lib/Frontend/CompilerInvocation.cpp cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu Index: cfe/trunk/lib/Frontend/CompilerInvocation.cpp === --- cfe/trunk/lib/Frontend/CompilerInvocation.cpp +++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp @@ -1406,6 +1406,9 @@ if (Args.hasArg(OPT_fcuda_is_device)) Opts.CUDAIsDevice = 1; + if (Args.hasArg(OPT_fcuda_uses_libdevice)) +Opts.CUDAUsesLibDevice = 1; + if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device)) Opts.CUDAAllowHostCallsFromHostDevice = 1; Index: cfe/trunk/lib/CodeGen/CodeGenAction.cpp === --- cfe/trunk/lib/CodeGen/CodeGenAction.cpp +++ cfe/trunk/lib/CodeGen/CodeGenAction.cpp @@ -159,7 +159,12 @@ if (LinkModule) { if (Linker::LinkModules( M, LinkModule.get(), -[=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); })) +[=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); }, +(LangOpts.CUDA && LangOpts.CUDAIsDevice && + LangOpts.CUDAUsesLibDevice) +? (Linker::Flags::LinkOnlyNeeded | + Linker::Flags::InternalizeLinkedSymbols) +: Linker::Flags::None)) return; } Index: cfe/trunk/include/clang/Driver/CC1Options.td === --- cfe/trunk/include/clang/Driver/CC1Options.td +++ cfe/trunk/include/clang/Driver/CC1Options.td @@ -659,6 +659,8 @@ HelpText<"Disable all cross-target (host, device, etc.) 
call checks in CUDA">; def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">, HelpText<"Incorporate CUDA device-side binary into host object file.">; +def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">, + HelpText<"Selectively link and internalize bitcode.">; } // let Flags = [CC1Option] Index: cfe/trunk/include/clang/Basic/LangOptions.def === --- cfe/trunk/include/clang/Basic/LangOptions.def +++ cfe/trunk/include/clang/Basic/LangOptions.def @@ -166,6 +166,7 @@ LANGOPT(CUDAIsDevice , 1, 0, "Compiling for CUDA device") LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions") LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)") +LANGOPT(CUDAUsesLibDevice , 1, 0, "Selectively link and internalize bitcode.") LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators") LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions") Index: cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll === --- cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll +++ cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll @@ -0,0 +1,38 @@ +; Simple bit of IR to mimic CUDA's libdevice. We want to be +; able to link with it and we need to make sure all __nvvm_reflect +; calls are eliminated by the time PTX has been produced. 
+ +target triple = "nvptx-unknown-cuda" + +declare i32 @__nvvm_reflect(i8*) + +@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" + +define void @unused_subfunc(float %a) { + ret void +} + +define void @used_subfunc(float %a) { + ret void +} + +define float @_Z17device_mul_or_addff(float %a, float %b) { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %cmp = icmp ne i32 %reflect, 0 + br i1 %cmp, label %use_mul, label %use_add + +use_mul: + %ret1 = fmul float %a, %b + br label %exit + +use_add: + %ret2 = fadd float %a, %b + br label %exit + +exit: + %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] + + call void @used_subfunc(float %ret) + + ret float %ret +} Index: cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu === --- cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu +++ cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu @@ -0,0 +1,56 @@ +// Test for linking with CUDA's libdevice as outlined in +// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice +// +// REQUIRES: nvptx-registered-target +// +// Prepare
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
tra updated this revision to Diff 34467. tra added a comment. Removed unneeded #includes. http://reviews.llvm.org/D11664 Files: include/clang/Basic/LangOptions.def include/clang/Driver/CC1Options.td lib/CodeGen/CodeGenAction.cpp lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/Inputs/device-code.ll test/CodeGenCUDA/link-device-bitcode.cu Index: test/CodeGenCUDA/link-device-bitcode.cu === --- /dev/null +++ test/CodeGenCUDA/link-device-bitcode.cu @@ -0,0 +1,56 @@ +// Test for linking with CUDA's libdevice as outlined in +// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice +// +// REQUIRES: nvptx-registered-target +// +// Prepare bitcode file to link with +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \ +// RUN:%S/Inputs/device-code.ll +// +// Make sure function in device-code gets linked in and internalized. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR +// +// Make sure function in device-code gets linked but is not internalized +// without -fcuda-uses-libdevice +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR-NLD +// +// Make sure NVVMReflect pass is enabled in NVPTX back-end. 
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o /dev/null %s \ +// RUN:-backend-option -debug-pass=Structure 2>&1 \ +// RUN:| FileCheck %s -check-prefix CHECK-REFLECT + +#include "Inputs/cuda.h" + +__device__ float device_mul_or_add(float a, float b); +extern "C" __device__ double __nv_sin(double x); +extern "C" __device__ double __nv_exp(double x); + +// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf( +// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf( +__device__ void should_not_be_internalized(float *data) {} + +// Make sure kernel call has not been internalized. +// CHECK-IR-LABEL: define void @_Z6kernelPfS_ +// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_( +__global__ __attribute__((used)) void kernel(float *out, float *in) { + *out = device_mul_or_add(in[0], in[1]); + *out += __nv_exp(__nv_sin(*out)); + should_not_be_internalized(out); +} + +// Make sure device_mul_or_add() is present in IR, is internal and +// calls __nvvm_reflect(). +// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff( +// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff( +// CHECK-IR: call i32 @__nvvm_reflect +// CHECK-IR: ret float + +// Verify that NVVMReflect pass is among the passes run by NVPTX back-end. +// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1 Index: test/CodeGenCUDA/Inputs/device-code.ll === --- /dev/null +++ test/CodeGenCUDA/Inputs/device-code.ll @@ -0,0 +1,38 @@ +; Simple bit of IR to mimic CUDA's libdevice. We want to be +; able to link with it and we need to make sure all __nvvm_reflect +; calls are eliminated by the time PTX has been produced. 
+ +target triple = "nvptx-unknown-cuda" + +declare i32 @__nvvm_reflect(i8*) + +@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" + +define void @unused_subfunc(float %a) { + ret void +} + +define void @used_subfunc(float %a) { + ret void +} + +define float @_Z17device_mul_or_addff(float %a, float %b) { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %cmp = icmp ne i32 %reflect, 0 + br i1 %cmp, label %use_mul, label %use_add + +use_mul: + %ret1 = fmul float %a, %b + br label %exit + +use_add: + %ret2 = fadd float %a, %b + br label %exit + +exit: + %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] + + call void @used_subfunc(float %ret) + + ret float %ret +} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1406,6 +1406,9 @@ if (Args.hasArg(OPT_fcuda_is_device)) Opts.CUDAIsDevice = 1; + if (Args.hasArg(OPT_fcuda_uses_libdevice)) +Opts.CUDAUsesLibDevice = 1; + if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device)) Opts.CUDAAllowHostCallsFromHostDevice = 1; Index: lib/CodeGen/CodeGenAction.cpp === --- lib/CodeGen/CodeGenAction.cpp +++ lib/CodeGen/CodeGenAction.cpp @@ -159,7 +159,12 @@ if (LinkModule) { if (Linker::LinkModules( M, LinkModule.get(), -[=](const Diagnosti
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
echristo added a comment. Works for me, thanks. http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
tra updated the summary for this revision. tra updated this revision to Diff 34251. tra added a comment. Assume (and test) that NVPTX back-end includes NVVMReflect by default. http://reviews.llvm.org/D11664 Files: include/clang/Basic/LangOptions.def include/clang/Driver/CC1Options.td lib/CodeGen/CodeGenAction.cpp lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/Inputs/device-code.ll test/CodeGenCUDA/link-device-bitcode.cu Index: test/CodeGenCUDA/link-device-bitcode.cu === --- /dev/null +++ test/CodeGenCUDA/link-device-bitcode.cu @@ -0,0 +1,56 @@ +// Test for linking with CUDA's libdevice as outlined in +// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice +// +// REQUIRES: nvptx-registered-target +// +// Prepare bitcode file to link with +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \ +// RUN:%S/Inputs/device-code.ll +// +// Make sure function in device-code gets linked in and internalized. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR +// +// Make sure function in device-code gets linked but is not internalized +// without -fcuda-uses-libdevice +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR-NLD +// +// Make sure NVVMReflect pass is enabled in NVPTX back-end. 
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o /dev/null %s \ +// RUN:-backend-option -debug-pass=Structure 2>&1 \ +// RUN:| FileCheck %s -check-prefix CHECK-REFLECT + +#include "Inputs/cuda.h" + +__device__ float device_mul_or_add(float a, float b); +extern "C" __device__ double __nv_sin(double x); +extern "C" __device__ double __nv_exp(double x); + +// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf( +// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf( +__device__ void should_not_be_internalized(float *data) {} + +// Make sure kernel call has not been internalized. +// CHECK-IR-LABEL: define void @_Z6kernelPfS_ +// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_( +__global__ __attribute__((used)) void kernel(float *out, float *in) { + *out = device_mul_or_add(in[0], in[1]); + *out += __nv_exp(__nv_sin(*out)); + should_not_be_internalized(out); +} + +// Make sure device_mul_or_add() is present in IR, is internal and +// calls __nvvm_reflect(). +// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff( +// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff( +// CHECK-IR: call i32 @__nvvm_reflect +// CHECK-IR: ret float + +// Verify that NVVMReflect pass is among the passes run by NVPTX back-end. +// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1 Index: test/CodeGenCUDA/Inputs/device-code.ll === --- /dev/null +++ test/CodeGenCUDA/Inputs/device-code.ll @@ -0,0 +1,38 @@ +; Simple bit of IR to mimic CUDA's libdevice. We want to be +; able to link with it and we need to make sure all __nvvm_reflect +; calls are eliminated by the time PTX has been produced. 
+ +target triple = "nvptx-unknown-cuda" + +declare i32 @__nvvm_reflect(i8*) + +@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" + +define void @unused_subfunc(float %a) { + ret void +} + +define void @used_subfunc(float %a) { + ret void +} + +define float @_Z17device_mul_or_addff(float %a, float %b) { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %cmp = icmp ne i32 %reflect, 0 + br i1 %cmp, label %use_mul, label %use_add + +use_mul: + %ret1 = fmul float %a, %b + br label %exit + +use_add: + %ret2 = fadd float %a, %b + br label %exit + +exit: + %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] + + call void @used_subfunc(float %ret) + + ret float %ret +} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1406,6 +1406,9 @@ if (Args.hasArg(OPT_fcuda_is_device)) Opts.CUDAIsDevice = 1; + if (Args.hasArg(OPT_fcuda_uses_libdevice)) +Opts.CUDAUsesLibDevice = 1; + if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device)) Opts.CUDAAllowHostCallsFromHostDevice = 1; Index: lib/CodeGen/CodeGenAction.cpp === --- lib/CodeGen/CodeGenAction.cpp +++ lib/CodeGen/CodeGenAction.cpp @@ -26,14 +26,16 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
echristo added a comment. Inline comment. Comment at: test/CodeGenCUDA/link-device-bitcode.cu:23-28 @@ +22,8 @@ +// +// NVVMReflect is a target-specific pass runs after -emit-llvm prints +// IR, so we need to check NVPTX to make sure that the pass did happen +// and __nvvm_reflect calls were eliminated. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-PTX + It would be better here if we could just check the pass structure and rely on the backend to test that the pass works. http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
tra updated the summary for this revision. tra updated this revision to Diff 33713. tra added a comment. Updated the patch to use bitcode linker to perform selective linking and internalizing. Removed Internalize+GDCE passes. http://reviews.llvm.org/D11664 Files: include/clang/Basic/LangOptions.def include/clang/Driver/CC1Options.td lib/CodeGen/BackendUtil.cpp lib/CodeGen/CodeGenAction.cpp lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/Inputs/device-code.ll test/CodeGenCUDA/link-device-bitcode.cu Index: test/CodeGenCUDA/link-device-bitcode.cu === --- /dev/null +++ test/CodeGenCUDA/link-device-bitcode.cu @@ -0,0 +1,61 @@ +// Test for linking with CUDA's libdevice as outlined in +// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice +// +// REQUIRES: nvptx-registered-target +// +// Prepare bitcode file to link with +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \ +// RUN:%S/Inputs/device-code.ll +// +// Make sure function in device-code gets linked in and internalized. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR +// +// Make sure function in device-code gets linked but is not internalized +// without -fcuda-uses-libdevice +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -emit-llvm \ +// RUN:-disable-llvm-passes -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-IR-NLD +// +// NVVMReflect is a target-specific pass runs after -emit-llvm prints +// IR, so we need to check NVPTX to make sure that the pass did happen +// and __nvvm_reflect calls were eliminated. 
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN:-mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o - %s \ +// RUN:| FileCheck %s -check-prefix CHECK-PTX + +#include "Inputs/cuda.h" + +__device__ float device_mul_or_add(float a, float b); +extern "C" __device__ double __nv_sin(double x); +extern "C" __device__ double __nv_exp(double x); + +// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf( +// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf( +__device__ void should_not_be_internalized(float *data) {} + +// Make sure kernel call has not been internalized. +// CHECK-IR-LABEL: define void @_Z6kernelPfS_ +// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_( +__global__ __attribute__((used)) void kernel(float *out, float *in) { + *out = device_mul_or_add(in[0], in[1]); + *out += __nv_exp(__nv_sin(*out)); + should_not_be_internalized(out); +} + +// Make sure device_mul_or_add() is present in IR, is internal and +// calls __nvvm_reflect(). +// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff( +// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff( +// CHECK-IR: call i32 @__nvvm_reflect +// CHECK-IR: ret float + +// By the time device_mul_or_add() makes it to PTX, __nvvm_reflect references +// should be gone. +// CHECK-PTX-NOT: .visible +// CHECK-PTX-LABEL: .func (.param .b32 func_retval0) _Z17device_mul_or_addff( +// CHECK-PTX-NOT: __nvvm_reflect +// CHECK-PTX: ret; Index: test/CodeGenCUDA/Inputs/device-code.ll === --- /dev/null +++ test/CodeGenCUDA/Inputs/device-code.ll @@ -0,0 +1,38 @@ +; Simple bit of IR to mimic CUDA's libdevice. We want to be +; able to link with it and we need to make sure all __nvvm_reflect +; calls are eliminated by the time PTX has been produced. 
+ +target triple = "nvptx-unknown-cuda" + +declare i32 @__nvvm_reflect(i8*) + +@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" + +define void @unused_subfunc(float %a) { + ret void +} + +define void @used_subfunc(float %a) { + ret void +} + +define float @_Z17device_mul_or_addff(float %a, float %b) { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %cmp = icmp ne i32 %reflect, 0 + br i1 %cmp, label %use_mul, label %use_add + +use_mul: + %ret1 = fmul float %a, %b + br label %exit + +use_add: + %ret2 = fadd float %a, %b + br label %exit + +exit: + %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] + + call void @used_subfunc(float %ret) + + ret float %ret +} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1406,6 +1406,9 @@ if (Args.hasArg(OPT_fcuda_is_device)) Opts.CUDAIsDevice = 1; + if (Args.hasArg(OPT_fcuda_uses_libdevice)) +Opts.CUDAUsesLibDevice = 1; + if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device)) Opts.CUDAAllowHostCallsFromHostDevi
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
echristo accepted this revision. echristo added a comment. This revision is now accepted and ready to land. The ternary is a bit ugly, but LGTM. :) -eric http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
echristo added inline comments. Comment at: lib/CodeGen/CodeGenAction.cpp:181-190 @@ -166,2 +180,12 @@ return; +if (LangOpts.CUDA && LangOpts.CUDAIsDevice && +LangOpts.CUDAUsesLibDevice) { + legacy::PassManager passes; + passes.add(createInternalizePass(ModuleFuncNames)); + // Considering that most of the functions we've linked are + // not going to be used, we may want to eliminate them + // early. + passes.add(createGlobalDCEPass()); + passes.run(*TheModule); +} } tra wrote: > echristo wrote: > > Seems like this should be part of the normal IPO pass run? This seems like > > an odd place to put this, can you explain why a bit more? > It will indeed happen during normal optimization, but as NVPTX docs says it > makes fair amount of sense to eliminate quite a bit of bitcode that we know > we're not going to need. libdevice carries ~450 functions and only handful of > those are needed. Why run all other optimization passes on them? > > In addition to that, we need to pass to Internalize list of symbols to > preserve. As far as I can tell the way to do it within normal optimization > pipeline is to pass them to back-end via > -internalize-public-api-list/-internalize-public-api-file. That's not > particularly suitable way to carry potentially large list of symbols we will > find in the TU we're dealing with. > > I could move GDCE to LLVM where it would arguably be somewhat more effective > if done after NVVMReflect, but keeping it next to internalize makes it easier > to see that we intentionally internalize and eliminate unused bitcode here. I might not have been clear. I'm curious why all of this isn't just part of the normal IPO pass run that should be happening on the code anyhow? Taking a step back - this should just go through the normal "let's set up a pipeline for the code" path, which might end up being CUDA-specific, but should be handled in the same way. Does that make sense? 
http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
tra added inline comments. Comment at: lib/CodeGen/CodeGenAction.cpp:166-170 @@ +165,7 @@ +std::vector ModuleFuncNames; +// We need to internalize contents of the linked module but it +// has to be done *after* the linking because internalized +// symbols will not be linked in otherwise. +// In order to do that, we preserve current list of function names in +// the module and then pass it to Internalize pass to preserve. +if (LangOpts.CUDA && LangOpts.CUDAIsDevice && echristo wrote: > Can you explain this in a different way perhaps? I'm not sure what you mean > here. From llvm.org/docs/NVPTXUsage.html. This patch implements the following items: > The internalize pass is also recommended to remove unused math functions from > the resulting PTX. For an input IR module module.bc, the following > compilation flow is recommended: > > 1 Save list of external functions in module.bc > 2 Link module.bc with libdevice.compute_XX.YY.bc > 3 Internalize all functions not in list from (1) > 4 Eliminate all unused internal functions The LLVM part of the changes takes care of NVVMReflect: > * Run NVVMReflect pass > * Run standard optimization pipeline Comment at: lib/CodeGen/CodeGenAction.cpp:181-190 @@ -166,2 +180,12 @@ return; +if (LangOpts.CUDA && LangOpts.CUDAIsDevice && +LangOpts.CUDAUsesLibDevice) { + legacy::PassManager passes; + passes.add(createInternalizePass(ModuleFuncNames)); + // Considering that most of the functions we've linked are + // not going to be used, we may want to eliminate them + // early. + passes.add(createGlobalDCEPass()); + passes.run(*TheModule); +} } echristo wrote: > Seems like this should be part of the normal IPO pass run? This seems like an > odd place to put this, can you explain why a bit more? It will indeed happen during normal optimization, but as the NVPTX docs say, it makes a fair amount of sense to eliminate quite a bit of bitcode that we know we're not going to need. libdevice carries ~450 functions and only a handful of those are needed. 
Why run all the other optimization passes on them? In addition to that, we need to pass Internalize the list of symbols to preserve. As far as I can tell, the way to do it within the normal optimization pipeline is to pass them to the back-end via -internalize-public-api-list/-internalize-public-api-file. That's not a particularly suitable way to carry the potentially large list of symbols we will find in the TU we're dealing with. I could move GDCE to LLVM, where it would arguably be somewhat more effective if done after NVVMReflect, but keeping it next to internalize makes it easier to see that we intentionally internalize and eliminate unused bitcode here. http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D11664: [CUDA] Implemented additional processing steps needed to link with CUDA libdevice bitcode.
echristo added inline comments. Comment at: lib/CodeGen/CodeGenAction.cpp:166-170 @@ +165,7 @@ +std::vector ModuleFuncNames; +// We need to internalize contents of the linked module but it +// has to be done *after* the linking because internalized +// symbols will not be linked in otherwise. +// In order to do that, we preserve current list of function names in +// the module and then pass it to Internalize pass to preserve. +if (LangOpts.CUDA && LangOpts.CUDAIsDevice && Can you explain this in a different way perhaps? I'm not sure what you mean here. Comment at: lib/CodeGen/CodeGenAction.cpp:181-190 @@ -166,2 +180,12 @@ return; +if (LangOpts.CUDA && LangOpts.CUDAIsDevice && +LangOpts.CUDAUsesLibDevice) { + legacy::PassManager passes; + passes.add(createInternalizePass(ModuleFuncNames)); + // Considering that most of the functions we've linked are + // not going to be used, we may want to eliminate them + // early. + passes.add(createGlobalDCEPass()); + passes.run(*TheModule); +} } Seems like this should be part of the normal IPO pass run? This seems like an odd place to put this, can you explain why a bit more? http://reviews.llvm.org/D11664 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits