Amir updated this revision to Diff 481138. Amir added a comment. Herald added a project: LLVM. Herald added a subscriber: llvm-commits.
Documentation Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D139496/new/ https://reviews.llvm.org/D139496 Files: clang/CMakeLists.txt clang/cmake/caches/BOLT.cmake llvm/docs/AdvancedBuilds.rst
Index: llvm/docs/AdvancedBuilds.rst =================================================================== --- llvm/docs/AdvancedBuilds.rst +++ llvm/docs/AdvancedBuilds.rst @@ -241,6 +241,56 @@ $ ninja stage2-clang-bolt +BOLT profile +------------ +BOLT uses the profile collected either by Linux `perf` or via BOLT's own +instrumentation. Both modes are supported by CMake automation, with +instrumentation being the default in `BOLT.cmake` and `BOLT-PGO.cmake`. + +It's strongly recommended to use `perf` if the host system supports it, as it +is a significantly faster and potentially more reliable method: + +.. code-block:: console + + $ cmake <...> -DCLANG_BOLT_PERF=ON \ + -C <path to source>/clang/cmake/caches/BOLT.cmake + +If the host system supports profiling branch stacks (e.g. AMD or Intel LBR +(Last Branch Record), Armv9-A BRBE (Branch Record Buffer Extension)), it can be +enabled with `-DCLANG_BOLT_PERF_LBR=ON` to further improve the profile quality: + +.. code-block:: console + + $ cmake <...> -DCLANG_BOLT_PERF=ON -DCLANG_BOLT_PERF_LBR=ON \ + -C <path to source>/clang/cmake/caches/BOLT.cmake + +The following matrix describes supported profiling methods. Note that Linux/ELF +is the only supported platform. + +============ =============== ========== =================== +Architecture Instrumentation Linux perf Linux perf with LBR +============ =============== ========== =================== +x86_64 Yes Yes Yes +AArch64 No Yes Not tested +============ =============== ========== =================== + +Profiling targets +----------------- +The BOLT profile is collected by building one of the in-tree projects/targets with +Clang as a workload. The following configuration options can be used to change +the profiling build: + +**CLANG_BOLT_PROJECTS** + Projects to enable in the profiling build. Defaults to `llvm`. + +**CLANG_BOLT_TARGETS** + Targets to build in the profiling build. Defaults to `count` in the instrumentation + build and `FileCheck` in the perf build. 
+ +**CLANG_BOLT_EXTRA_CMAKE_FLAGS** + Extra CMake flags to pass to profiling build at configuration time. + + 3-Stage Non-Determinism ======================= Index: clang/cmake/caches/BOLT.cmake =================================================================== --- clang/cmake/caches/BOLT.cmake +++ clang/cmake/caches/BOLT.cmake @@ -1,15 +1,17 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") -set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "") -set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "") +set(CLANG_BOLT_PERF OFF CACHE BOOL "") +set(CLANG_BOLT_PERF_LBR OFF CACHE BOOL "") + +set(CLANG_BOLT_PROJECTS "llvm" CACHE STRING "") +if (CLANG_BOLT_PERF) + set(CLANG_BOLT_INSTRUMENT OFF CACHE BOOL "" FORCE) + set(CLANG_BOLT_TARGETS "FileCheck" CACHE STRING "") +else() + set(CLANG_BOLT_TARGETS "count" CACHE STRING "") +endif() set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") -set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "") +set(CLANG_BOLT_EXTRA_CMAKE_FLAGS "" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") - -# Disable function splitting enabled by default in GCC8+ -if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition") -endif() Index: clang/CMakeLists.txt =================================================================== --- clang/CMakeLists.txt +++ clang/CMakeLists.txt @@ -869,67 +869,106 @@ endforeach() endif() -if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +if (CLANG_BOLT_INSTRUMENT OR CLANG_BOLT_PERF AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) set(CLANGXX_PATH ${CLANG_PATH}++) - set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) - set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst) set(CLANG_OPTIMIZED 
${CLANG_PATH}-bolt) set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt) - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) + if (CLANG_BOLT_INSTRUMENT) + set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) + set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst) - # Make a symlink from clang-bolt.inst to clang++-bolt.inst - add_custom_target(clang++-instrumented - DEPENDS ${CLANGXX_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED} - DEPENDS clang-instrumented - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CLANG_INSTRUMENTED} - ${CLANGXX_INSTRUMENTED} - COMMENT "Creating symlink from BOLT instrumented clang to clang++" - VERBATIM - ) + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${CLANG_INSTRUMENTED} + ${CLANGXX_INSTRUMENTED} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) + endif() + + # Set variables for profile collection step + if (CLANG_BOLT_INSTRUMENT) + set(CLANG_BOLT_CC ${CLANG_INSTRUMENTED}) + set(CLANG_BOLT_CXX ${CLANGXX_INSTRUMENTED}) + else() # CLANG_BOLT_PERF + set(CLANG_BOLT_CC ${CLANG_PATH}) + set(CLANG_BOLT_CXX ${CLANGXX_PATH}) + + # Perf sampling: + # - use maximum frequency to reduce training time + # - use cycle events instead of branches - empirically found to produce + # better results + # - if available, enable 
taken branch stack/LBR sampling + # (-j/--branch-filter) + set(PERF_CMDLINE + perf record --event=cycles:u + --output=${CMAKE_CURRENT_BINARY_DIR}/prof.data + --freq=max + ) + if (CLANG_BOLT_PERF_LBR) + list(APPEND PERF_CMDLINE --branch-filter=any,u) + endif() + list(APPEND PERF_CMDLINE --) + endif() + + # Build specified targets to collect the profile + add_custom_target(bolt-profile-deps) + if (CLANG_BOLT_INSTRUMENT) + add_dependencies(bolt-profile-deps clang-instrumented) + set(CLANG_BOLT_PROFILE ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata) + else() + add_dependencies(bolt-profile-deps clang) + set(CLANG_BOLT_PROFILE ${CMAKE_CURRENT_BINARY_DIR}/prof.data) + endif() + set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-stamps/) + set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-bins/) + add_custom_target(bolt-clang-clear + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared + ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared + DEPENDS bolt-profile-deps + COMMAND ${CMAKE_COMMAND} -E remove_directory ${BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E make_directory ${BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E remove_directory ${STAMP_DIR} + COMMAND ${CMAKE_COMMAND} -E make_directory ${STAMP_DIR} + COMMENT "Clobberring bolt-clang build and stamp directories" + ) - # Build specified targets with instrumented Clang to collect the profile - set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/) - set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/) set(build_configuration "$<CONFIG>") include(ExternalProject) - ExternalProject_Add(bolt-instrumentation-profile - DEPENDS clang++-instrumented - PREFIX bolt-instrumentation-profile + ExternalProject_Add(bolt-profile + DEPENDS bolt-profile-deps + PREFIX bolt-profile SOURCE_DIR ${CMAKE_SOURCE_DIR} STAMP_DIR ${STAMP_DIR} BINARY_DIR ${BINARY_DIR} EXCLUDE_FROM_ALL 1 CMAKE_ARGS - ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS} + ${CLANG_BOLT_EXTRA_CMAKE_FLAGS} # We 
shouldn't need to set this here, but INSTALL_DIR doesn't # seem to work, so instead I'm passing this through -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} - -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED} - -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED} - -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED} + -DCMAKE_C_COMPILER=${CLANG_BOLT_CC} + -DCMAKE_CXX_COMPILER=${CLANG_BOLT_CXX} + -DCMAKE_ASM_COMPILER=${CLANG_BOLT_CC} -DCMAKE_ASM_COMPILER_ID=Clang - -DCMAKE_BUILD_TYPE=Release - -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_INSTRUMENT_PROJECTS} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_PROJECTS} -DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD} - BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR} + BUILD_COMMAND ${PERF_CMDLINE} ${CMAKE_COMMAND} --build ${BINARY_DIR} --config ${build_configuration} - --target ${CLANG_BOLT_INSTRUMENT_TARGETS} + --target ${CLANG_BOLT_TARGETS} INSTALL_COMMAND "" STEP_TARGETS configure build USES_TERMINAL_CONFIGURE 1 @@ -937,21 +976,31 @@ USES_TERMINAL_INSTALL 1 ) - # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata - ) - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata - DEPENDS merge-fdata bolt-instrumentation-profile-build - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${Python3_EXECUTABLE} - ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata - $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata - ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Preparing BOLT profile" - VERBATIM + DEPENDS ${CLANG_BOLT_PROFILE} ) + if (CLANG_BOLT_INSTRUMENT) + # Merge profiles into one using merge-fdata + add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE} + DEPENDS bolt-profile-build merge-fdata + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata + $<TARGET_FILE:merge-fdata> prof.fdata . 
+ COMMENT "Preparing BOLT profile" + VERBATIM + ) + else() # CLANG_BOLT_PERF + # Profile is produced by perf by running the build + add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE} + DEPENDS bolt-profile-build + ) + endif() + # Pass extra flag in no-LBR mode + if (CLANG_BOLT_PERF AND NOT CLANG_BOLT_PERF_LBR) + set(CLANG_BOLT_NO_LBR "-nl") + endif() # Optimize original (pre-bolt) Clang using the collected profile add_custom_target(clang-bolt DEPENDS ${CLANG_OPTIMIZED} @@ -960,9 +1009,10 @@ DEPENDS clang-bolt-profile COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_OPTIMIZED} - -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata + -data ${CLANG_BOLT_PROFILE} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions - -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack + -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack -plt=hot + ${CLANG_BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} ${CLANG_PATH}-${CLANG_VERSION_MAJOR} COMMENT "Optimizing Clang with BOLT" VERBATIM
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits