https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811
--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> --- Also, I forgot to mention that I used a zen3 machine, so Raptor Lake is not necessary. Note that the build system appends -O2 after any CFLAGS specified, so it really is an -O2 build: # Force build with optimizations in release mode. set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2") For Clang, other options are appended: -fnew-alignment=8 -fno-cxx-exceptions -fno-slp-vectorize -fno-vectorize -disable-free -disable-llvm-verifier The perf profile mixing both the GCC and clang builds is: 8.36% cjxl libjxl.so.0.7.0 [.] jxl::(anonymous namespace)::FindTextLikePatches ◆ 5.74% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary ▒ 4.51% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy ▒ 4.50% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::TransformFromPixels ▒ 4.25% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC ▒ 4.10% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy ▒ 3.77% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::TransformFromPixels ▒ 3.46% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC ▒ 3.08% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier ▒ 3.04% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier ▒ 2.98% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<8ul, 8ul>::operator() ▒ 2.80% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*)::{l▒ 2.75% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*)::$_▒ 2.26% cjxl libjxl.so.0.7.0 [.] 
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc ▒ 2.00% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.95% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<16ul, 8ul>::operator() ▒ 1.68% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long, unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long, JxlEnd▒ 1.68% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<32ul, 8ul>::operator() ▒ 1.66% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long, unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long, JxlEnd▒ 1.56% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.52% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<32ul, 8ul>::operator() ▒ 1.33% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0, jxl::N_AVX2▒ 1.27% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.11% cjxl libjxl.so.0.7.0 [.] 
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous namespace)::FindTextLikePatches(jxl::Image3<float> const&, jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*, bool)::{lambda(un▒ So it is some hand-written AVX code. In GCC the top function is FindTextLikePatches, while in clang it is FindBestPatchDictionary. We do not inline it because of the large function growth limit. Adding --param large-function-insns=1000000 makes the inlining decisions match and has no effect on performance. With this change I get: 8.42% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary ◆ 5.72% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary ▒ 4.50% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::TransformFromPixels ▒ 4.46% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy ▒ 4.25% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC ▒ 4.14% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy ▒ 3.76% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::TransformFromPixels ▒ 3.56% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC ▒ 3.10% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier ▒ 3.00% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier ▒ 2.98% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<8ul, 8ul>::operator() ▒ 2.82% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*)::{l▒ 2.75% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*)::$_▒ 2.26% cjxl libjxl.so.0.7.0 [.] 
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc ▒ 1.99% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.95% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<16ul, 8ul>::operator() ▒ 1.69% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long, unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long, JxlEnd▒ 1.67% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long, unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long, JxlEnd▒ 1.66% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<32ul, 8ul>::operator() ▒ 1.54% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.49% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<32ul, 8ul>::operator() ▒ 1.34% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0, jxl::N_AVX2▒ 1.27% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo> ▒ 1.16% cjxl libjxl.so.0.7.0 [.] 
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous namespace)::FindTextLikePatches(jxl::Image3<float> const&, jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*, bool)::{lambda(un▒ 1.07% cjxl libjxl.so.0.7.0 [.] jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous namespace)::FindTextLikePatches(jxl::Image3<float> const&, jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*, bool)::$_0>::Call▒