https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22141

--- Comment #44 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
With the #c43 patch, the included store_merging_10.c improves on x86_64 from:
        movzbl  (%rdi), %eax
-       andl    $-19, %eax
+       andl    $-32, %eax
        orl     $13, %eax
        movb    %al, (%rdi)
in foo and
-       orb     $1, (%rdi)
        movl    (%rdi), %eax
-       andl    $-131071, %eax
+       andl    $2147352576, %eax
+       orl     $1, %eax
        movl    %eax, (%rdi)
-       shrl    $24, %eax
-       andl    $127, %eax
-       movb    %al, 3(%rdi)
in bar.  foo is something combine.c already managed to optimize, but it
couldn't optimize bar.
In store_merging_11.c on x86_64, bar is the same and foo changed:
-       movabsq $578437695752115969, %rax
-       movl    $0, 9(%rdi)
-       movb    $0, 15(%rdi)
-       movq    %rax, 1(%rdi)
-       xorl    %eax, %eax
-       movw    %ax, 13(%rdi)
+       movl    $23, %eax
+       movb    $1, 1(%rdi)
+       movl    $117835012, 4(%rdi)
+       movw    %ax, 2(%rdi)
+       movq    $8, 8(%rdi)
which is not only shorter, but all the stores are aligned.
On ppc64le in store_merging_10.c the difference is:
-       lwz 9,0(3)
+       lbz 9,0(3)
        rlwinm 9,9,0,0,26
        ori 9,9,0xd
-       stw 9,0(3)
+       stb 9,0(3)
in foo and
        lwz 9,0(3)
+       rlwinm 9,9,0,1,14
        ori 9,9,0x1
-       rlwinm 9,9,0,31,14
-       rlwinm 9,9,0,1,31
        stw 9,0(3)
in bar, and in store_merging_11.c the difference is:
-       lis 8,0x807
-       li 9,0
-       ori 8,8,0x605
-       li 10,0
-       sldi 8,8,32
-       stw 9,9(3)
-       sth 9,13(3)
-       oris 8,8,0x400
-       stb 10,15(3)
-       ori 8,8,0x1701
-       mtvsrd 0,8
-       stfd 0,1(3)
+       lis 9,0x706
+       li 7,1
+       li 8,23
+       ori 9,9,0x504
+       li 10,8
+       stb 7,1(3)
+       sth 8,2(3)
+       stw 9,4(3)
+       std 10,8(3)
in foo and no changes in bar.

What the patch doesn't implement yet, but what could also be possible for the
allow_unaligned case, is the following: in store_merging_11.c, where we are
storing 15 bytes, store 8 bytes at offset 1 and 8 bytes at offset 8 (i.e.
create two overlapping stores, in this case one unaligned and one aligned).

Reply via email to