https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92825

            Bug ID: 92825
           Summary: Unnecessary stack protection and missed SLP
                    vectorization in Firefox's LightPixel.
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

Created attachment 47428
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47428&action=edit
full testcase

// Applies a diffuse lighting factor to one packed 32-bit pixel and returns the
// resulting packed pixel.
//
// aNormal / aVectorToLight: vectors whose dot product drives the lighting
//   factor (negative dot products are clamped to 0).
// aColor: input pixel; its four bytes are addressed through the
//   B8G8R8A8_COMPONENT_BYTEOFFSET_* indices (defined outside this excerpt).
// Returns: the pixel with B, G and R scaled and saturated at 255, and the
//   alpha byte forced to 255.
uint32_t DiffuseLightingSoftware::LightPixel(const Point3D& aNormal,
                                             const Point3D& aVectorToLight,
                                             uint32_t aColor) {
  // Clamp at zero so a negative dot product contributes no light.
  Float dotNL = std::max(0.0f, aNormal.DotProduct(aVectorToLight));
  // Single scale factor shared by all three color channels below.
  Float diffuseNL = mDiffuseConstant * dotNL;

  // View the packed pixel as four individual bytes. Brace-initialization sets
  // the first member (bgra) to aColor.
  // NOTE(review): components[] is a local array, which is presumably what
  // makes -fstack-protector-strong instrument this function (the "unnecessary
  // stack protection" in the report summary) -- confirm against GCC docs.
  union {
    uint32_t bgra;
    uint8_t components[4];
  } color = {aColor};
  // Each channel: scale by diffuseNL, convert to uint32_t, cap at 255, then
  // store back into the byte. umin is defined outside this excerpt;
  // presumably an unsigned minimum -- verify.
  // The three statements are identical apart from the byte index, which is
  // the pattern the report expects SLP vectorization to pick up.
  color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_B] = umin(
      uint32_t(diffuseNL * color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_B]),
      255U);
  color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_G] = umin(
      uint32_t(diffuseNL * color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_G]),
      255U);
  color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_R] = umin(
      uint32_t(diffuseNL * color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_R]),
      255U);
  // Alpha is not lit; it is unconditionally overwritten with 255.
  color.components[B8G8R8A8_COMPONENT_BYTEOFFSET_A] = 255;
  // Read the four bytes back as one packed value.
  return color.bgra;
}

(full testcase attached)
Building with -O3 -fstack-protector-strong results in slower code with gcc10
than with gcc9 or clang.

GCC produces:
       │     0000000004390e20 <mozilla::gfx::(anonymous
namespace)::SpecularLightingSoftware::LightPixel(mozilla::gfx::Point3DTyped<mozilla::gfx::UnknownUnits,
float> const&,
       │    
_ZN7mozilla3gfx12_GLOBAL__N_124SpecularLightingSoftware10LightPixelERKNS0_12Point3DTypedINS0_12UnknownUnitsEfEES7_j():
  0.19 │       push      %rbp
  0.60 │       pxor      %xmm5,%xmm5
  0.05 │       mov       %rsp,%rbp
  0.12 │       push      %rbx
  0.65 │       sub       $0x18,%rsp
  0.33 │       movss     0x4(%rdx),%xmm0
  0.10 │       movss     (%rdx),%xmm1
  0.58 │       mov       %fs:0x28,%rax
  0.03 │       mov       %rax,-0x18(%rbp)
  0.22 │       xor       %eax,%eax
  0.07 │       movss     pw_32+0x1588,%xmm3
  1.58 │       addss     0x8(%rdx),%xmm3
  0.67 │       addss     %xmm5,%xmm0
  0.23 │       addss     %xmm5,%xmm1
       │       movaps    %xmm0,%xmm2
  0.41 │       movaps    %xmm1,%xmm4
  0.87 │       mulss     %xmm0,%xmm2
  0.28 │       mulss     %xmm1,%xmm4
  3.71 │       addss     %xmm2,%xmm4
  0.14 │       movaps    %xmm3,%xmm2
  0.04 │       mulss     %xmm3,%xmm2
  1.99 │       addss     %xmm2,%xmm4
  0.15 │       movss     0x4(%rsi),%xmm2
  9.39 │       sqrtss    %xmm4,%xmm4
  8.90 │       divss     %xmm4,%xmm0
  2.10 │       divss     %xmm4,%xmm3
  1.08 │       mulss     %xmm0,%xmm2
  0.01 │       movss     0x8(%rsi),%xmm0

while clang produces:
Percent│    
_ZN7mozilla3gfx12_GLOBAL__N_124SpecularLightingSoftware10LightPixelERKNS0_12Point3DTypedINS0_12UnknownUnitsEfEES7_j():
  0.11 │       xorps     %xmm0,%xmm0
  0.83 │       movss     0x4(%rdx),%xmm1
  3.29 │       addss     %xmm0,%xmm1
  0.03 │       movss     (%rdx),%xmm2
  0.08 │       movss     0x8(%rdx),%xmm3
  0.04 │       unpcklps  %xmm2,%xmm3
  0.59 │       movss    
mozilla::gfx::ConvertComponentTransferFunctionToFilter(mozilla::gfx::ComponentTransferAttributes
const&, int, int, mozilla::gfx::DrawTarget*, RefPtr<m
  1.00 │       addps     %xmm2,%xmm3
  0.10 │       movaps    %xmm3,%xmm4
  0.82 │       shufps    $0xe5,%xmm3,%xmm4
  3.05 │       mulss     %xmm4,%xmm4
  0.09 │       movaps    %xmm1,%xmm5
  0.12 │       mulss     %xmm1,%xmm5
  2.77 │       addss     %xmm4,%xmm5
  0.06 │       movaps    %xmm3,%xmm4
  0.00 │       mulss     %xmm3,%xmm4
  2.95 │       addss     %xmm5,%xmm4
  9.54 │       sqrtss    %xmm4,%xmm4
  8.84 │       divss     %xmm4,%xmm1
  0.08 │       shufps    $0xe0,%xmm4,%xmm4
  2.45 │       divps     %xmm4,%xmm3
  0.88 │       mulss     0x4(%rsi),%xmm1
  0.01 │       movss     (%rsi),%xmm4
       │       movss     0x8(%rsi),%xmm5
  0.02 │       unpcklps  %xmm4,%xmm5
  2.82 │       mulps     %xmm3,%xmm5
  0.03 │       movaps    %xmm5,%xmm3
  0.88 │       shufps    $0xe5,%xmm5,%xmm3
  3.47 │       addss     %xmm1,%xmm3
  3.39 │       addss     %xmm5,%xmm3
  3.09 │       cmpless   %xmm3,%xmm0
  1.77 │       andps     %xmm2,%xmm0
  3.01 │       mulss     %xmm3,%xmm0
  3.25 │       mulss    
mozIGeckoMediaPluginService::COMTypeInfo<mozIGeckoMediaPluginService,
void,%xmm0
  4.85 │       cvttss2si %xmm0,%eax

Reply via email to