https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64500
--- Comment #9 from ak at gcc dot gnu.org ---
I can test it later, but it would surprise me if it helps. The problem is not
the computation but the cache misses. When profiling it, I see a lot of cache
misses on the "cmp" memory load. So we likely need to do something about the
data structure.
Looking at some LBR (Last Branch Record) data, the list walks just seem to be
too long. Several of the iterations exceeded the 32-entry limit of the Intel
LBR. A 90+ cycle latency for a single pass through the loop must be multiple
cache misses. I saw up to 340 cycles just for the loop body.
For example, here is an excerpt with cycle data:
0000000001278705 jnz 0x12786e0
# PRED 74 cycles [74]
00000000012786e0 cmpw $0x2, (%rbx)
00000000012786e4 jz 0x1278e20
00000000012786ea movq 0x20(%rbx), %rbp
00000000012786ee test %rbp, %rbp
00000000012786f1 jz 0x12786fe
00000000012786f3 cmpq $0x0, 0x38(%rbp)
00000000012786f8 jnz 0x1278868
00000000012786fe movq 0x10(%rbx), %rbx
0000000001278702 test %rbx, %rbx
0000000001278705 jnz 0x12786e0
# PRED 78 cycles [152] 0.13 IPC
00000000012786e0 cmpw $0x2, (%rbx)
00000000012786e4 jz 0x1278e20
00000000012786ea movq 0x20(%rbx), %rbp
00000000012786ee test %rbp, %rbp
00000000012786f1 jz 0x12786fe
00000000012786f3 cmpq $0x0, 0x38(%rbp)
00000000012786f8 jnz 0x1278868
00000000012786fe movq 0x10(%rbx), %rbx
0000000001278702 test %rbx, %rbx
0000000001278705 jnz 0x12786e0
# PRED 356 cycles [508] 0.03 IPC
00000000012786e0 cmpw $0x2, (%rbx)
00000000012786e4 jz 0x1278e20
00000000012786ea movq 0x20(%rbx), %rbp
00000000012786ee test %rbp, %rbp
00000000012786f1 jz 0x12786fe
00000000012786f3 cmpq $0x0, 0x38(%rbp)
00000000012786f8 jnz 0x1278868
00000000012786fe movq 0x10(%rbx), %rbx
0000000001278702 test %rbx, %rbx
0000000001278705 jnz 0x12786e0
# PRED 24 cycles [532] 0.42 IPC
00000000012786e0 cmpw $0x2, (%rbx)
00000000012786e4 jz 0x1278e20
00000000012786ea movq 0x20(%rbx), %rbp
00000000012786ee test %rbp, %rbp
00000000012786f1 jz 0x12786fe
00000000012786f3 cmpq $0x0, 0x38(%rbp)
00000000012786f8 jnz 0x1278868
00000000012786fe movq 0x10(%rbx), %rbx
0000000001278702 test %rbx, %rbx
0000000001278705 jnz 0x12786e0
# PRED 94 cycles [626] 0.11 IPC
00000000012786e0 cmpw $0x2, (%rbx)
00000000012786e4 jz 0x1278e20
00000000012786ea movq 0x20(%rbx), %rbp
00000000012786ee test %rbp, %rbp
00000000012786f1 jz 0x12786fe
00000000012786f3 cmpq $0x0, 0x38(%rbp)
00000000012786f8 jnz 0x1278868
00000000012786fe movq 0x10(%rbx), %rbx
0000000001278702 test %rbx, %rbx
0000000001278705 jnz 0x12786e0
# PRED 70 cycles [696] 0.14 IPC
...