http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55953



             Bug #: 55953

           Summary: hand loop faster than builtin memset

    Classification: Unclassified

           Product: gcc

           Version: unknown

            Status: UNCONFIRMED

          Severity: normal

          Priority: P3

         Component: c

        AssignedTo: unassig...@gcc.gnu.org

        ReportedBy: dushis...@mail.ru





variant 1:

char c[100];



void f(void)

{

        for(int i=0; i < 100; ++i)

                c[i] = '0';

}



assembly:

push   %rbp

vmovdqa 0x117(%rip),%ymm0        # 0x400960

mov    %rsp,%rbp

pop    %rbp

movb   $0x30,0x20086c(%rip)# 0x6010c0 <c+96>

vmovdqa %ymm0,0x200804(%rip)# 0x601060 <c>

vmovdqa %ymm0,0x20081c(%rip)# 0x601080 <c+32>

vmovdqa %ymm0,0x200834(%rip)# 0x6010a0 <c+64>

movb   $0x30,0x20084e(%rip)# 0x6010c1 <c+97>

movb   $0x30,0x200848(%rip)# 0x6010c2 <c+98>

movb   $0x30,0x200842(%rip)# 0x6010c3 <c+99>

vzeroupper 

retq   





variant 2:

char c[100];



void f(void)

{

        memset(c, '0', 100);

}



assembly:

movabs $0x3030303030303030,%rax

movl   $0x30303030,0x20086c(%rip)        # 0x6010c0 <c+96>

mov    %rax,0x200805(%rip)        # 0x601060 <c>

mov    %rax,0x200806(%rip)        # 0x601068 <c+8>

mov    %rax,0x200807(%rip)        # 0x601070 <c+16>

mov    %rax,0x200808(%rip)        # 0x601078 <c+24>

mov    %rax,0x200809(%rip)        # 0x601080 <c+32>

mov    %rax,0x20080a(%rip)        # 0x601088 <c+40>

mov    %rax,0x20080b(%rip)        # 0x601090 <c+48>

mov    %rax,0x20080c(%rip)        # 0x601098 <c+56>

mov    %rax,0x20080d(%rip)        # 0x6010a0 <c+64>

mov    %rax,0x20080e(%rip)        # 0x6010a8 <c+72>

mov    %rax,0x20080f(%rip)        # 0x6010b0 <c+80>

mov    %rax,0x200810(%rip)        # 0x6010b8 <c+88>

retq   



The first variant takes (timed with: for (size_t i = 0; i < 10000000; ++i) f();):

0.150000 secs, abs 0.150255

The second variant takes:

0.170000 secs, abs 0.175502



CPU: Intel i7

gcc --version

gcc (Gentoo 4.7.2 p1.3, pie-0.5.5) 4.7.2

compile options: -Ofast -march=native



Expected behaviour: 

Both variants should compile to the same assembly code, and that code should be at least as fast as variant 1.

Reply via email to