http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53346
--- Comment #4 from Uros Bizjak <ubizjak at gmail dot com> 2012-05-17 20:09:42 UTC ---

Instead of this:

.L228:
        movl    $0, -4(%rdx,%rax,4)
        addq    $1, %rax
        cmpq    %rax, %rsi
        jge     .L228

vectorization generates the following:

        movq    %rdx, %rax
        movq    %r9, %r8
        andl    $15, %eax
        shrq    $2, %rax
        negq    %rax
        andl    $3, %eax
        cmpq    %r9, %rax
        cmovbe  %rax, %r8
        cmpq    $6, %r9
        cmovbe  %r9, %r8
        testq   %r8, %r8
        je      .L233
        leaq    1(%r8), %rsi
        movl    $1, %eax
        .p2align 4,,10
        .p2align 3
.L176:
        movl    $0, -4(%rdx,%rax,4)
        addq    $1, %rax
        cmpq    %rsi, %rax
        jne     .L176
        cmpq    %r9, %r8
        je      .L182
.L174:
        movq    %r9, %rbp
        subq    %r8, %rbp
        movq    %rbp, %r11
        shrq    $2, %r11
        leaq    0(,%r11,4), %rbx
        testq   %rbx, %rbx
        je      .L181
        pxor    %xmm0, %xmm0
        leaq    (%rdx,%r8,4), %r8
        xorl    %esi, %esi
        .p2align 4,,10
        .p2align 3
.L183:
        addq    $1, %rsi
        movdqa  %xmm0, (%r8)
        addq    $16, %r8
        cmpq    %rsi, %r11
        ja      .L183
        addq    %rbx, %rax
        cmpq    %rbx, %rbp
        je      .L182
        .p2align 4,,10
        .p2align 3
.L181:
        movl    $0, -4(%rdx,%rax,4)
        addq    $1, %rax
        cmpq    %rax, %r9
        jge     .L181

Whoa.