https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125937
Bug ID: 125937
Summary: [17 Regression] A TSVC testcase slower by ~40% since
r17-223-ga22b31304e0a1a
Product: gcc
Version: 17.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: pheeck at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
Target Milestone: ---
Host: x86_64-pc-linux-gnu
Target: x86_64-pc-linux-gnu
Created attachment 64826
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=64826&action=edit
s4115 testcase with all the other necessary TSVC files
The TSVC benchmark suite testcases s4115 and s4116 slowed down.
Compile flags: *.c -Ofast -march=native -lm
CPU: Zen3, though I've seen this on other machines
I've bisected this to r17-223-ga22b31304e0a1a
commit a22b31304e0a1ad21751f882c02c32c167c78793
Author: Richard Biener <[email protected]>
AuthorDate: Fri Apr 24 14:35:49 2026 +0200
Commit: Richard Biener <[email protected]>
CommitDate: Thu Apr 30 08:13:03 2026 +0200
flip --param ix86-vect-compare-costs default
Here are the testcases:
----
//int s4115(int* __restrict__ ip)
// TSVC kernel s4115: timed sparse dot product of a[] with b[] gathered through
// an index vector.
//
// func_args->arg_info supplies the index array `ip`; func_args->t1 and
// func_args->t2 receive the timestamps taken immediately before and after the
// benchmark loop. Returns the sum computed by the last outer iteration.
//
// NOTE(review): `sum` is assigned only inside the outer loop, so the returned
// value is indeterminate if `iterations` is <= 0.
real_t s4115(struct args_t * func_args)
{
// indirect addressing
// sparse dot product
// gather is required
// Index vector used to pick elements of b[]; declared restrict, i.e. the
// author asserts it does not alias the other accessed objects.
int * __restrict__ ip = func_args->arg_info;
// Passes this function's name; presumably selects the per-kernel initial
// array contents (definition not shown here).
initialise_arrays(__func__);
// Start timestamp.
gettimeofday(&func_args->t1, NULL);
real_t sum;
// Repeat the kernel `iterations` times; the reduction restarts from zero on
// every repetition.
for (int nl = 0; nl < iterations; nl++) {
sum = 0.;
// The kernel under test: contiguous load from a[], indexed (gather-style)
// load from b[] via ip[i], accumulated into a scalar.
for (int i = 0; i < LEN_1D; i++) {
sum += a[i] * b[ip[i]];
}
// Called once per repetition with all global arrays; presumably an
// optimization barrier so the repetitions are not collapsed (definition not
// shown here).
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
// End timestamp.
gettimeofday(&func_args->t2, NULL);
return sum;
}
// %4.11
//int s4116(int* __restrict__ ip, int j, int inc)
// TSVC kernel s4116: timed sparse dot product of an offset slice of a[] with
// one row of aa[][] gathered through an index vector.
//
// func_args->arg_info points to a record holding the index array and two
// integers (read below as `ip`, `j`, `inc`); func_args->t1 and func_args->t2
// receive the timestamps taken immediately before and after the benchmark
// loop. Returns the sum computed by the last outer iteration.
//
// NOTE(review): `sum` is assigned only inside the outer loop, so the returned
// value is indeterminate if `100*iterations` is <= 0.
real_t s4116(struct args_t * func_args)
{
// indirect addressing
// more complicated sparse sdot
// gather is required
// View arg_info through an anonymous struct matching the layout the caller
// is expected to have packed: {index array, j, inc}.
struct{int * __restrict__ a;int b;int c;} * x = func_args->arg_info;
int * __restrict__ ip = x->a;
int j = x->b;
int inc = x->c;
// Passes this function's name; presumably selects the per-kernel initial
// array contents (definition not shown here).
initialise_arrays(__func__);
// Start timestamp.
gettimeofday(&func_args->t1, NULL);
real_t sum;
int off;
// Repeat the kernel 100*iterations times; the reduction restarts from zero
// on every repetition.
for (int nl = 0; nl < 100*iterations; nl++) {
sum = 0.;
// The kernel under test: a[] is read contiguously starting at offset `inc`,
// row j-1 of aa[][] is read at the column given by ip[i].
for (int i = 0; i < LEN_2D-1; i++) {
off = inc + i;
sum += a[off] * aa[j-1][ip[i]];
}
// Called once per repetition with all global arrays; presumably an
// optimization barrier so the repetitions are not collapsed (definition not
// shown here).
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
// End timestamp.
gettimeofday(&func_args->t2, NULL);
return sum;
}
----
I've packaged s4115 together with all the other necessary TSVC sources into an
archive and attach it to this report.
Here are the differences I see in the disassembly of tsvc.c from the
attachment. The '-' lines correspond to r17-222, the '+' lines to r17-223:
--- before-culprit/tsvc.s 2026-06-22 15:03:03.674209255 +0200
+++ culprit/tsvc.s 2026-06-22 15:05:14.190621248 +0200
@@ -27,26 +27,24 @@
call gettimeofday
.L2:
xorl %eax, %eax
- vxorps %xmm2, %xmm2, %xmm2
+ vxorps %xmm0, %xmm0, %xmm0
+ .p2align 6
.p2align 4
.p2align 3
.L3:
leaq (%r12,%rax), %rdx
- addq $16, %rax
- movslq (%rdx), %rsi
- movslq 8(%rdx), %rdi
- movslq 4(%rdx), %rcx
- movslq 12(%rdx), %rdx
- vmovss b(,%rdi,4), %xmm1
- vmovss b(,%rsi,4), %xmm0
+ vmovq a(%rax), %xmm2
+ addq $8, %rax
+ movslq (%rdx), %rcx
+ movslq 4(%rdx), %rdx
+ vmovss b(,%rcx,4), %xmm1
vinsertps $0x10, b(,%rdx,4), %xmm1, %xmm1
- vinsertps $0x10, b(,%rcx,4), %xmm0, %xmm0
- vmovlhps %xmm1, %xmm0, %xmm0
- vmulps a-16(%rax), %xmm0, %xmm0
- vaddps %xmm0, %xmm2, %xmm2
+ vmulps %xmm2, %xmm1, %xmm1
+ vaddps %xmm0, %xmm1, %xmm1
+ vmovaps %xmm1, %xmm0
cmpq $128000, %rax
jne .L3
- vmovaps %xmm2, (%rsp)
+ vmovaps %xmm1, (%rsp)
movl $c, %edx
pushq $cc
.cfi_def_cfa_offset 56
@@ -68,7 +66,7 @@
leaq 16(%rbp), %rdi
xorl %esi, %esi
call gettimeofday
- vmovaps (%rsp), %xmm2
+ vmovaps (%rsp), %xmm1
addq $16, %rsp
.cfi_def_cfa_offset 32
popq %rbx
@@ -77,10 +75,9 @@
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
- vmovhlps %xmm2, %xmm2, %xmm1
- vaddps %xmm2, %xmm1, %xmm1
- vshufps $85, %xmm1, %xmm1, %xmm0
- vaddps %xmm1, %xmm0, %xmm0
+ vpsrlq $32, %xmm1, %xmm0
+ vaddps %xmm0, %xmm1, %xmm1
+ vmovaps %xmm1, %xmm0
ret
.cfi_endproc
.LFE11: