https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111601
--- Comment #21 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Reduced testcase (though just the function in question, not a runnable
testcase):
/* Minimal stand-in node type for the reduced testcase: its only member is a
   16-bit wide bit-field, read by splice_viable as c->fn->code.  */
struct tree_base
{
int code:16;
};
/* Reduced scope record.  pad[] stands in for removed members so that
   x_processing_template_decl stays behind 14 pointer-sized slots
   (offset 112 when pointers are 8 bytes).  */
struct saved_scope
{
void *pad[14];
int x_processing_template_decl;
};
/* Defined elsewhere; splice_viable dereferences it unconditionally.  */
extern struct saved_scope *scope_chain;
/* Reduced candidate record, chained into a singly linked list via next.
   pad[] stands in for removed members; with 8-byte pointers fn is at
   offset 0, next at offset 96 and viable at offset 104, which matches the
   96(...) and 104(...) displacements in the quoted assembly.
   The member types use the tag names without the struct keyword, so this
   is C++ (the testcase is compiled with cc1plus).  */
struct z_candidate
{
tree_base *fn;
void *pad[11];
z_candidate *next;
/* splice_viable tests this for non-zero (loose) and for == 1 (strict).  */
int viable;
/* Not accessed by splice_viable.  */
int flags;
};
/* Split the list CANDS (linked through ->next) into viable and non-viable
   candidates.

   Every candidate that passes the viability test is unlinked from CANDS and
   appended to a separate list, and *ANY_VIABLE_P is set to true.  The test
   is `viable == 1' in strict mode and `viable != 0' otherwise.  Strict mode
   is in effect when STRICT_P is passed as true, when
   scope_chain->x_processing_template_decl is non-zero, or from the point
   where the walk meets a candidate with viable == 1 or fn->code == 273.

   *ANY_VIABLE_P is always written (it starts out false).  Returns the
   separate viable list if it is non-empty, otherwise CANDS.  */
__attribute__((noipa)) struct z_candidate *
splice_viable (struct z_candidate *cands, bool strict_p, bool *any_viable_p)
{
/* Head of the list of candidates accepted so far.  */
struct z_candidate *viable;
/* Address of the link where the next accepted candidate is stored:
   &viable while the list is empty, else &tail->next.  */
struct z_candidate **last_viable;
/* Address of the link that refers to the candidate being examined.  */
struct z_candidate **cand;
/* Set once a candidate with viable == 1 has been accepted.  */
bool found_strictly_viable = false;
if (scope_chain->x_processing_template_decl)
strict_p = true;
viable = (z_candidate *) 0;
last_viable = &viable;
*any_viable_p = false;
cand = &cands;
while (*cand)
{
struct z_candidate *c = *cand;
/* Still in loose mode and this candidate has viable == 1 or code 273:
   switch to strict mode for the rest of the walk.  */
if (!strict_p && (c->viable == 1 || ((int) (c->fn)->code) == 273))
{
strict_p = true;
/* Candidates accepted so far all passed only the loose test, so undo
   that: chain the remaining CANDS behind them, make the result the new
   CANDS, and restart with an empty viable list.  */
if (viable && !found_strictly_viable)
{
*any_viable_p = false;
*last_viable = cands;
cands = viable;
viable = (z_candidate *) 0;
last_viable = &viable;
}
}
if (strict_p ? c->viable == 1 : c->viable)
{
/* Append C to the viable list, unlink it from CANDS and terminate the
   viable list at C.  CAND is left alone because *CAND now already refers
   to C's former successor.
   NOTE(review): the store through last_viable and the NULL store to
   c->next appear to be the `std 9,0(10)' and `std 5,...(10)' instructions
   in the quoted assembly diff -- confirm against the RTL dumps.  */
*last_viable = c;
*cand = c->next;
c->next = (z_candidate *) 0;
last_viable = &c->next;
*any_viable_p = true;
if (c->viable == 1)
found_strictly_viable = true;
}
else
/* Not accepted: keep C in CANDS and advance to its successor.  */
cand = &c->next;
}
return viable ? viable : cands;
}
With this and
./cc1plus -quiet -fpreprocessed -O2 -fprofile-generate -fno-exceptions
-fno-rtti -fasynchronous-unwind-tables -fno-common -fno-PIE -mcpu=power8
pr111601.ii -o pr111601.s3 -ffold-mem-offsets -da
vs.
./cc1plus -quiet -fpreprocessed -O2 -fprofile-generate -fno-exceptions
-fno-rtti -fasynchronous-unwind-tables -fno-common -fno-PIE -mcpu=power8
pr111601.ii -o pr111601.s4 -fno-fold-mem-offsets -da
the assembly difference is just
.L13:
std 9,0(10)
mr 10,9
li 5,0
+ addi 10,10,96
li 7,1
addi 4,4,1
addi 6,6,1
ld 9,96(9)
std 9,0(8)
- std 5,96(10)
+ std 5,0(10)
stb 7,0(31)
ori 2,2,0
ld 9,0(8)
cmpdi 0,9,0
beq 0,.L18
lwz 7,104(9)
li 12,1
li 5,1
cmpwi 0,7,1
beq 0,.L13
which shows the problem in a single loop. Without the pass, %r10 is set to
%r9 + 96, %r5 (NULL) is stored through it first, and if the loop iterates
again, %r9 is stored through it. With the pass, %r10 is set to %r9 and %r5
(NULL) is stored to %r10 + 96, but the next iteration then stores %r9 to
0(%r10), which overwrites the fn pointer in the structure rather than next.