[PATCH] D88734: [HIP] Align device binary
This revision was automatically updated to reflect the committed changes. yaxunl marked an inline comment as done. Closed by commit rGdc6a0b0ec7e3: [HIP] Align device binary (authored by yaxunl). Herald added a project: clang. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D88734/new/ https://reviews.llvm.org/D88734 Files: clang/lib/CodeGen/CGCUDANV.cpp clang/lib/Driver/ToolChains/HIP.cpp clang/test/CodeGenCUDA/device-stub.cu clang/test/Driver/clang-offload-bundler.c clang/test/Driver/hip-toolchain-no-rdc.hip clang/test/Driver/hip-toolchain-rdc.hip clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp === --- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -94,6 +94,11 @@ "instead of actually executing them - for testing purposes.\n"), cl::init(false), cl::cat(ClangOffloadBundlerCategory)); +static cl::opt +BundleAlignment("bundle-align", +cl::desc("Alignment of bundle for binary files"), +cl::init(1), cl::cat(ClangOffloadBundlerCategory)); + /// Magic string that marks the existence of offloading data. #define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__" @@ -223,6 +228,9 @@ StringMap::iterator CurBundleInfo; StringMap::iterator NextBundleInfo; + /// Current bundle target to be written. + std::string CurWriteBundleTarget; + public: BinaryFileHandler() : FileHandler() {} @@ -337,10 +345,12 @@ unsigned Idx = 0; for (auto &T : TargetNames) { MemoryBuffer &MB = *Inputs[Idx++]; + HeaderSize = alignTo(HeaderSize, BundleAlignment); // Bundle offset. Write8byteIntegerToBuffer(OS, HeaderSize); // Size of the bundle (adds to the next bundle's offset) Write8byteIntegerToBuffer(OS, MB.getBufferSize()); + BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize); HeaderSize += MB.getBufferSize(); // Size of the triple Write8byteIntegerToBuffer(OS, T.size()); @@ -351,6 +361,7 @@ } Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final { +CurWriteBundleTarget = TargetTriple.str(); return Error::success(); } @@ -359,6 +370,8 @@ } Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final { +auto BI = BundlesInfo[CurWriteBundleTarget]; +OS.seek(BI.Offset); OS.write(Input.getBufferStart(), Input.getBufferSize()); return Error::success(); } Index: clang/test/Driver/hip-toolchain-rdc.hip === --- clang/test/Driver/hip-toolchain-rdc.hip +++ clang/test/Driver/hip-toolchain-rdc.hip @@ -8,10 +8,14 @@ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ // RUN: -fuse-ld=lld -fgpu-rdc -nogpuinc \ +// RUN: -fhip-dump-offload-linker-script \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s +// check code object alignment in dumped llvm-mc input +// CHECK: .p2align 12 + // emit objects for host side path // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" @@ -87,6 +91,7 @@ // combine images generated into hip fat binary object // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" Index: clang/test/Driver/hip-toolchain-no-rdc.hip === --- clang/test/Driver/hip-toolchain-no-rdc.hip +++ clang/test/Driver/hip-toolchain-no-rdc.hip @@ -81,6 +81,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" @@ -143,6 +144,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" Index: clang/test/Driver/clang-offload-bundler.c === --- clang/test/Driver/clang-offload-bundler.c +++ clang/test/Driver/clang-offload-bundler.c @@ -278,6 +278,16 @@ // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt
[PATCH] D88734: [HIP] Align device binary
yaxunl marked an inline comment as done. yaxunl added inline comments. Comment at: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp:374 +auto BI = BundlesInfo[CurWriteBundleTarget]; +OS.seek(BI.Offset); OS.write(Input.getBufferStart(), Input.getBufferSize()); tra wrote: > Does the bundler anways create the file from scratch or truncate it? > If it were to operate on existing file, seek would leave some data as is and > that may result in nondeterministic output. > It may be better to explicitly zero the padding. It will truncate the output file if it exists, therefore it should be fine to use seek. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D88734/new/ https://reviews.llvm.org/D88734 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D88734: [HIP] Align device binary
tra accepted this revision. tra added inline comments. This revision is now accepted and ready to land. Comment at: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp:374 +auto BI = BundlesInfo[CurWriteBundleTarget]; +OS.seek(BI.Offset); OS.write(Input.getBufferStart(), Input.getBufferSize()); Does the bundler anways create the file from scratch or truncate it? If it were to operate on existing file, seek would leave some data as is and that may result in nondeterministic output. It may be better to explicitly zero the padding. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D88734/new/ https://reviews.llvm.org/D88734 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D88734: [HIP] Align device binary
yaxunl created this revision. yaxunl added reviewers: tra, ashi1. yaxunl requested review of this revision. Herald added a reviewer: jdoerfert. Herald added a subscriber: sstefan1. To facilitate faster loading of device binaries and share them among processes, HIP runtime favors their alignment being 4096 bytes. HIP runtime can load unaligned device binaries, however, aligning them at 4096 bytes results in faster loading and less shared memory usage. This patch adds an option -bundle-align to clang-offload-bundler which allows bundles to be aligned at specified alignment. By default it is 1, which is NFC compared to existing format. This patch then aligns embedded fat binary and device binary inside fat binary at 4096 bytes. It has been verified this change does not cause significant overall file size increase for typical HIP applications (less than 1%). https://reviews.llvm.org/D88734 Files: clang/lib/CodeGen/CGCUDANV.cpp clang/lib/Driver/ToolChains/HIP.cpp clang/test/CodeGenCUDA/device-stub.cu clang/test/Driver/clang-offload-bundler.c clang/test/Driver/hip-toolchain-no-rdc.hip clang/test/Driver/hip-toolchain-rdc.hip clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp === --- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -94,6 +94,11 @@ "instead of actually executing them - for testing purposes.\n"), cl::init(false), cl::cat(ClangOffloadBundlerCategory)); +static cl::opt +BundleAlignment("bundle-align", +cl::desc("Alignment of bundle for binary files"), +cl::init(1), cl::cat(ClangOffloadBundlerCategory)); + /// Magic string that marks the existence of offloading data. #define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__" @@ -223,6 +228,9 @@ StringMap::iterator CurBundleInfo; StringMap::iterator NextBundleInfo; + /// Current bundle target to be written. + std::string CurWriteBundleTarget; + public: BinaryFileHandler() : FileHandler() {} @@ -337,10 +345,12 @@ unsigned Idx = 0; for (auto &T : TargetNames) { MemoryBuffer &MB = *Inputs[Idx++]; + HeaderSize = alignTo(HeaderSize, BundleAlignment); // Bundle offset. Write8byteIntegerToBuffer(OS, HeaderSize); // Size of the bundle (adds to the next bundle's offset) Write8byteIntegerToBuffer(OS, MB.getBufferSize()); + BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize); HeaderSize += MB.getBufferSize(); // Size of the triple Write8byteIntegerToBuffer(OS, T.size()); @@ -351,6 +361,7 @@ } Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final { +CurWriteBundleTarget = TargetTriple.str(); return Error::success(); } @@ -359,6 +370,8 @@ } Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final { +auto BI = BundlesInfo[CurWriteBundleTarget]; +OS.seek(BI.Offset); OS.write(Input.getBufferStart(), Input.getBufferSize()); return Error::success(); } Index: clang/test/Driver/hip-toolchain-rdc.hip === --- clang/test/Driver/hip-toolchain-rdc.hip +++ clang/test/Driver/hip-toolchain-rdc.hip @@ -8,10 +8,14 @@ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ // RUN: -fuse-ld=lld -fgpu-rdc -nogpuinc \ +// RUN: -fhip-dump-offload-linker-script \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s +// check code object alignment in dumped llvm-mc input +// CHECK: .p2align 12 + // emit objects for host side path // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" @@ -87,6 +91,7 @@ // combine images generated into hip fat binary object // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" Index: clang/test/Driver/hip-toolchain-no-rdc.hip === --- clang/test/Driver/hip-toolchain-no-rdc.hip +++ clang/test/Driver/hip-toolchain-no-rdc.hip @@ -81,6 +81,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" @@ -143,6 +144,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]]