https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811

--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Also forgot to mention, I used a zen3 machine, so Raptor Lake is not necessary.
Note that the build system appends -O2 after any CFLAGS specified, so it really
is an -O2 build:

# Force build with optimizations in release mode.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")

For Clang other options are appended:

      -fnew-alignment=8
      -fno-cxx-exceptions
      -fno-slp-vectorize
      -fno-vectorize

      -disable-free
      -disable-llvm-verifier


The perf profile mixing both the GCC and Clang builds is:

   8.36%  cjxl     libjxl.so.0.7.0          [.] jxl::(anonymous
namespace)::FindTextLikePatches                                                
                                                                               
                                       ◆
   5.74%  cjxl     libjxl.so.0.7.0          [.] jxl::FindBestPatchDictionary   
                                                                               
                                                                               
                       ▒
   4.51%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::EstimateEntropy   
                                                                               
                                                                               
                       ▒
   4.50%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels                                                
                                                                               
                               ▒
   4.25%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::QuantizeBlockAC   
                                                                               
                                                                               
                       ▒
   4.10%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::EstimateEntropy   
                                                                               
                                                                               
                       ▒
   3.77%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels                                                
                                                                               
                               ▒
   3.46%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::QuantizeBlockAC   
                                                                               
                                                                               
                       ▒
   3.08%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::FindBestMultiplier
                                                                               
                                                                               
                       ▒
   3.04%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::FindBestMultiplier
                                                                               
                                                                               
                       ▒
   2.98%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<8ul, 8ul>::operator()                                    
                                                                               
                               ▒
   2.80%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::{l▒
   2.75%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::$_▒
   2.26%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*,
jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc                     
            ▒
   2.00%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>                                     
                               ▒
   1.95%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<16ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.68%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
   1.68%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.66%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
   1.56%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>                                     
                               ▒
   1.52%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.33%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0,
jxl::N_AVX2▒
   1.27%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous
namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo>                
                                                   ▒
   1.11%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::{lambda(un▒

So it is some hand-written AVX code.  In GCC the top function is
FindTextLikePatches, while in Clang it is FindBestPatchDictionary.  We do not
inline it (FindTextLikePatches) because of the large function growth limit.
Adding --param large-function-insns=1000000 makes the inlining decisions match
and has no effect on performance.

With these changes I get:
   8.42%  cjxl     libjxl.so.0.7.0          [.] jxl::FindBestPatchDictionary   
                                                                               
                                                                               
                       ◆
   5.72%  cjxl     libjxl.so.0.7.0          [.] jxl::FindBestPatchDictionary   
                                                                               
                                                                               
                       ▒
   4.50%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels                                                
                                                                               
                               ▒
   4.46%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::EstimateEntropy   
                                                                               
                                                                               
                       ▒
   4.25%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::QuantizeBlockAC   
                                                                               
                                                                               
                       ▒
   4.14%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::EstimateEntropy   
                                                                               
                                                                               
                       ▒
   3.76%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels                                                
                                                                               
                               ▒
   3.56%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::QuantizeBlockAC   
                                                                               
                                                                               
                       ▒
   3.10%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::FindBestMultiplier
                                                                               
                                                                               
                       ▒
   3.00%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::FindBestMultiplier
                                                                               
                                                                               
                       ▒
   2.98%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<8ul, 8ul>::operator()                                    
                                                                               
                               ▒
   2.82%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::{l▒
   2.75%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::$_▒
   2.26%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*,
jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc                     
            ▒
   1.99%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>                                     
                               ▒
   1.95%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<16ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.69%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
   1.67%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
   1.66%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.54%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>                                     
                               ▒
   1.49%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()                                   
                                                                               
                               ▒
   1.34%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0,
jxl::N_AVX2▒
   1.27%  cjxl     libjxl.so.0.7.0          [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous
namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo>                
                                                   ▒
   1.16%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::{lambda(un▒
   1.07%  cjxl     libjxl.so.0.7.0          [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::$_0>::Call▒

Reply via email to