https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57952

--- Comment #3 from mmokrejs at gmail dot com ---
Weird: why can g++ do the task, although probably less efficiently than icc?

$ g++ -O3 -march=core-avx-i -mtune=core-avx-i -mavx2 stream.c ; objdump -d a.out | grep ymm
stream.c:106:48: warning: deprecated conversion from string constant to 'char*'
[-Wwrite-strings]
     "Add:       ", "Triad:     ", "Dot:       "};
                                                ^
stream.c:106:48: warning: deprecated conversion from string constant to 'char*'
[-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*'
[-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*'
[-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*'
[-Wwrite-strings]
  4006ac:       c5 fd 28 0d 2c 11 00    vmovapd 0x112c(%rip),%ymm1        #
4017e0 <_ZL5label+0xa0>
  4006b4:       c5 fd 28 05 44 11 00    vmovapd 0x1144(%rip),%ymm0        #
401800 <_ZL5label+0xc0>
  4006c0:       c5 fd 29 88 80 61 ab    vmovapd %ymm1,0x4cab6180(%rax)
  4006cc:       c5 fd 29 80 60 c1 85    vmovapd %ymm0,0x2685c160(%rax)
  400738:       c5 fd 28 01             vmovapd (%rcx),%ymm0
  400740:       c5 fd 58 c0             vaddpd %ymm0,%ymm0,%ymm0
  400744:       c5 fd 29 41 e0          vmovapd %ymm0,-0x20(%rcx)
  400803:       c5 fd 28 15 15 10 00    vmovapd 0x1015(%rip),%ymm2        #
401820 <_ZL5label+0xe0>
  400817:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  4008d0:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400908:       c5 fd 29 10             vmovapd %ymm2,(%rax)
  40091c:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  40098c:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  4009b8:       c5 fd 28 80 80 61 ab    vmovapd 0x4cab6180(%rax),%ymm0
  4009c4:       c5 fd 58 80 60 c1 85    vaddpd 0x2685c160(%rax),%ymm0,%ymm0
  4009cc:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  4009e3:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400a56:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400a80:       c5 ed 59 80 80 c1 85    vmulpd 0x2685c180(%rax),%ymm2,%ymm0
  400a88:       c5 fd 58 80 80 61 ab    vaddpd 0x4cab6180(%rax),%ymm0,%ymm0
  400a94:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  400aab:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400b1e:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400b77:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400bab:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  401006:       c4 e3 6d 18 95 28 ff    vinsertf128
$0x1,-0xd8(%rbp),%ymm2,%ymm2
  401018:       c4 e3 7d 18 85 48 ff    vinsertf128
$0x1,-0xb8(%rbp),%ymm0,%ymm0
  401022:       c5 ed 5c 95 10 ff ff    vsubpd -0xf0(%rbp),%ymm2,%ymm2
  40102a:       c5 fd 5c 85 30 ff ff    vsubpd -0xd0(%rbp),%ymm0,%ymm0
  401032:       c5 fd 28 25 66 07 00    vmovapd 0x766(%rip),%ymm4        #
4017a0 <_ZL5label+0x60>
  401044:       c5 ed 59 d4             vmulpd %ymm4,%ymm2,%ymm2
  401048:       c5 fd 59 c4             vmulpd %ymm4,%ymm0,%ymm0
  40104c:       c5 fd e6 d2             vcvttpd2dq %ymm2,%xmm2
  401058:       c5 fd e6 c0             vcvttpd2dq %ymm0,%xmm0
  40105c:       c4 e3 6d 38 d0 01       vinserti128 $0x1,%xmm0,%ymm2,%ymm2
  401062:       c4 e2 6d 3d d3          vpmaxsd %ymm3,%ymm2,%ymm2
  401067:       c4 e2 6d 39 15 50 07    vpminsd 0x750(%rip),%ymm2,%ymm2       
# 4017c0 <_ZL5label+0x80>
  401096:       c4 e3 75 18 8d 68 ff    vinsertf128
$0x1,-0x98(%rbp),%ymm1,%ymm1
  4010a4:       c5 f5 5c 8d 50 ff ff    vsubpd -0xb0(%rbp),%ymm1,%ymm1
  4010b7:       c5 f5 59 cc             vmulpd %ymm4,%ymm1,%ymm1
  4010bb:       c5 fd e6 c9             vcvttpd2dq %ymm1,%xmm1
  4010d3:       c4 e3 7d 18 45 88 01    vinsertf128
$0x1,-0x78(%rbp),%ymm0,%ymm0
  4010da:       c5 fd 5c 85 70 ff ff    vsubpd -0x90(%rbp),%ymm0,%ymm0
  4010e2:       c5 fd 59 c4             vmulpd %ymm4,%ymm0,%ymm0
  4010e6:       c5 fd e6 c0             vcvttpd2dq %ymm0,%xmm0
  4010ea:       c4 e3 75 38 c0 01       vinserti128 $0x1,%xmm0,%ymm1,%ymm0
  4010f5:       c4 e2 7d 3d c3          vpmaxsd %ymm3,%ymm0,%ymm0
  4010fa:       c4 e2 6d 39 c0          vpminsd %ymm0,%ymm2,%ymm0
  4010ff:       c4 e3 7d 46 c8 01       vperm2i128 $0x1,%ymm0,%ymm0,%ymm1
  401105:       c4 e2 7d 39 c1          vpminsd %ymm1,%ymm0,%ymm0
  40110f:       c5 f5 73 d8 08          vpsrldq $0x8,%ymm0,%ymm1
  401114:       c4 e2 7d 39 c9          vpminsd %ymm1,%ymm0,%ymm1
  401119:       c5 fd 73 d9 04          vpsrldq $0x4,%ymm1,%ymm0
  40111e:       c4 e2 75 39 c0          vpminsd %ymm0,%ymm1,%ymm0
  4011e0:       c4 e2 7d 19 c0          vbroadcastsd %xmm0,%ymm0
  4011f0:       c5 fd 29 00             vmovapd %ymm0,(%rax)
  401218:       c5 fd 28 80 80 61 ab    vmovapd 0x4cab6180(%rax),%ymm0
  401224:       c5 fd 58 80 60 c1 85    vaddpd 0x2685c160(%rax),%ymm0,%ymm0
  40122c:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  401240:       c4 e2 7d 19 c0          vbroadcastsd %xmm0,%ymm0
  401250:       c5 fd 59 88 80 c1 85    vmulpd 0x2685c180(%rax),%ymm0,%ymm1
  401258:       c5 f5 58 88 80 61 ab    vaddpd 0x4cab6180(%rax),%ymm1,%ymm1
  401264:       c5 fd 29 88 60 21 60    vmovapd %ymm1,0x602160(%rax)
$

Reply via email to