On Fri, Nov 15, 2013 at 09:17:14AM -0800, Hendrik Greving wrote:
> Also keep in mind that usually costs go up significantly if
> misalignment causes cache line splits (processor will fetch 2 lines).
> There are non-linear costs of filling up the store queue in modern
> out-of-order processors (x86). Bottom line is that it's much better to
> peel e.g. for AVX2/AVX3 if the loop would cause loads that cross cache
> line boundaries otherwise. The solution is to either actually always
> peel for alignment, or insert an additional check for cache line
> boundaries (for high trip count loops).
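(For concreteness, the suggested "additional check for cache line
boundaries" would amount to a runtime dispatch roughly like the sketch
below; this is hypothetical C, assuming 64-byte cache lines and 32-byte
AVX2 accesses, and set_vector_aligned/set_vector_unaligned are made-up
names for the two loop versions. It is the misprediction cost of exactly
this kind of branch that is weighed below.)

#include <stdint.h>

void set_vector_aligned(int *p, int n);    /* hypothetical: aligned loads   */
void set_vector_unaligned(int *p, int n);  /* hypothetical: unaligned loads */

/* Sketch of a runtime cache-line-boundary check: with a 32-byte access
   stride, no load crosses a 64-byte line iff the pointer is 32-byte
   aligned, so a single test up front can pick the loop version.  */
void set_dispatch(int *p, int n)
{
    if (((uintptr_t) p & 31) == 0)
        set_vector_aligned(p, n);     /* no access splits a cache line */
    else
        set_vector_unaligned(p, n);   /* some accesses cross a line    */
}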
That is quite a bold claim, do you have a benchmark to support it? Since
Nehalem there is no penalty for unaligned SSE loads beyond fetching the
cache lines themselves, and since Haswell AVX2 loads behave in a similar
way.

You are forgetting that the loop needs both cache lines when it issues an
unaligned load, which generally takes the maximum of the times needed to
access these lines. With peeling you access the first cache line, and only
afterwards, in the loop, the second, effectively doubling the running time
when both lines were in main memory.

You also need to weigh all the factors, not just show that one factor is
expensive. There are several factors in play; the cost of branch
misprediction is the main argument against peeling, so you need to show
that the cost of unaligned loads is bigger than the cost of branch
misprediction in a peeled implementation.

As a quick example of why peeling is generally a bad idea I did a simple
benchmark. Could somebody with Haswell also test the attached code
generated by gcc -O3 -march=core-avx2 (files set[13]_avx2.s)?

For the test we repeatedly call a function set with pointers randomly
picked from 262144 bytes to stress the L2 cache; the relevant part of the
tester (file test.c) is:

  for (i = 0; i < 100000000; i++)
    set (ptr + 64 * (p % (SIZE / 64) + 60),
         ptr2 + 64 * (q % (SIZE / 64) + 60));

First we vectorize the following function. The vectorizer does peeling
here (the assembly is a bit long, see file set1.s):

void set(int *p, int *q)
{
  int i;
  for (i = 0; i < 128; i++)
    p[i] = 42 * p[i];
}

When I ran it I got:

$ gcc -O3 -DSIZE=262144 -c test.c
$ gcc test.o set1.s
$ time ./a.out

real	0m3.724s
user	0m3.724s
sys	0m0.000s

Now what happens if we use separate input and output arrays? The gcc
vectorizer fortunately does not peel in this case (file set2.s), which
gives better performance:

void set(int *p, int *q)
{
  int i;
  for (i = 0; i < 128; i++)
    p[i] = 42 * q[i];
}

$ gcc test.o set2.s
$ time ./a.out

real	0m3.169s
user	0m3.170s
sys	0m0.000s

The speedup here can be partially explained by the fact that in-place
modification runs slower. To eliminate this possibility we change the
assembly to make the input the same as the output (file set3.s):

   jb	.L15
 .L7:
 	xorl	%eax, %eax
+	movq	%rdi, %rsi
 	.p2align 4,,10
 	.p2align 3
 .L5:

$ gcc test.o set3.s
$ time ./a.out

real	0m3.169s
user	0m3.170s
sys	0m0.000s

This is still faster than what the peeling vectorizer generated. Note
that in this test I did not vary the alignment; it is constant, so branch
misprediction is not an issue here.
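For reference, the shape of the peeled code that the vectorizer generates
for set1 corresponds roughly to the following C sketch (an illustration
assuming 32-byte AVX2 vectors, not actual compiler output; the real code
is in the attached assembly):

#include <stdint.h>

void set_peeled(int *p)
{
    int i = 0, j;

    /* Peeled scalar prologue: at most 7 iterations, until p + i is
       32-byte aligned.  */
    while (i < 128 && ((uintptr_t) (p + i) & 31) != 0) {
        p[i] = 42 * p[i];
        i++;
    }
    /* Vector body: each group of 8 ints stands in for one aligned
       AVX2 multiply.  */
    for (; i + 8 <= 128; i += 8)
        for (j = 0; j < 8; j++)
            p[i + j] = 42 * p[i + j];
    /* Scalar epilogue for the remainder.  */
    for (; i < 128; i++)
        p[i] = 42 * p[i];
}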
#define _GNU_SOURCE
#include <stdlib.h>
#include <malloc.h>	/* for pvalloc */

void set(int *p, int *q);

int main()
{
  char *ptr = pvalloc(2 * SIZE + 128);
  char *ptr2 = pvalloc(2 * SIZE + 128);
  unsigned long p = 31;
  unsigned long q = 17;
  int i;

  for (i = 0; i < 100000000; i++)
    {
      set ((int *) (ptr + 64 * (p % (SIZE / 64) + 60)),
           (int *) (ptr2 + 64 * (q % (SIZE / 64) + 60)));
      /* Linear congruential updates give a cheap pseudo-random walk
         over the buffers.  */
      p = 11 * p + 3;
      q = 13 * q + 5;
    }
  return 0;
}
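(Note that pvalloc returns page-aligned memory and the offsets above are
multiples of 64, so every pointer passed to set is cache-line aligned.
The alignment checks in the attached assembly therefore always take the
same path, consistent with the remark above that alignment is constant
and branch misprediction does not enter this measurement.)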
.file "set1.c" .text .p2align 4,,15 .globl set .type set, @function set: .LFB0: .cfi_startproc leaq 32(%rdi), %rax cmpq %rax, %rsi jb .L12 movq %rdi, %rsi .L6: vmovdqu (%rsi), %ymm1 vmovdqa .LC0(%rip), %ymm0 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, (%rdi) vmovdqu 32(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 32(%rdi) vmovdqu 64(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 64(%rdi) vmovdqu 96(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 96(%rdi) vmovdqu 128(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 128(%rdi) vmovdqu 160(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 160(%rdi) vmovdqu 192(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 192(%rdi) vmovdqu 224(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 224(%rdi) vmovdqu 256(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 256(%rdi) vmovdqu 288(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 288(%rdi) vmovdqu 320(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 320(%rdi) vmovdqu 352(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 352(%rdi) vmovdqu 384(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 384(%rdi) vmovdqu 416(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 416(%rdi) vmovdqu 448(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 448(%rdi) vmovdqu 480(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm0 vmovdqu %ymm0, 480(%rdi) vzeroupper ret .p2align 4,,10 .p2align 3 .L12: leaq 32(%rsi), %rax cmpq %rax, %rdi jae .L6 xorl %eax, %eax .p2align 4,,10 .p2align 3 .L5: movl (%rsi,%rax), %edx movl $42, %ecx imull %ecx, %edx movl %edx, (%rdi,%rax) addq $4, %rax cmpq $512, %rax jne .L5 rep ret .cfi_endproc .LFE0: .size set, .-set .section .rodata.cst32,"aM",@progbits,32 .align 32 .LC0: .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .ident "GCC: (Debian 4.8.1-10) 4.8.1" .section .note.GNU-stack,"",@progbits
.file "set2.c" .text .p2align 4,,15 .globl set .type set, @function set: .LFB0: .cfi_startproc leaq 16(%rdi), %rax cmpq %rax, %rsi jb .L15 .L7: xorl %eax, %eax .p2align 4,,10 .p2align 3 .L5: movdqu (%rsi,%rax), %xmm1 pslld $1, %xmm1 movdqa %xmm1, %xmm0 pslld $2, %xmm0 psubd %xmm1, %xmm0 movdqa %xmm0, %xmm1 pslld $3, %xmm1 psubd %xmm0, %xmm1 movdqu %xmm1, (%rdi,%rax) addq $16, %rax cmpq $512, %rax jne .L5 rep ret .p2align 4,,10 .p2align 3 .L15: leaq 16(%rsi), %rax cmpq %rax, %rdi jae .L7 xorl %eax, %eax .p2align 4,,10 .p2align 3 .L6: movl (%rsi,%rax), %edx movl $42, %ecx imull %ecx, %edx movl %edx, (%rdi,%rax) addq $4, %rax cmpq $512, %rax jne .L6 rep ret .cfi_endproc .LFE0: .size set, .-set .ident "GCC: (Debian 4.8.1-10) 4.8.1" .section .note.GNU-stack,"",@progbits
.file "set2.c" .text .p2align 4,,15 .globl set .type set, @function set: .LFB0: .cfi_startproc leaq 16(%rdi), %rax cmpq %rax, %rsi jb .L15 .L7: xorl %eax, %eax movq %rdi, %rsi .p2align 4,,10 .p2align 3 .L5: movdqu (%rsi,%rax), %xmm1 pslld $1, %xmm1 movdqa %xmm1, %xmm0 pslld $2, %xmm0 psubd %xmm1, %xmm0 movdqa %xmm0, %xmm1 pslld $3, %xmm1 psubd %xmm0, %xmm1 movdqu %xmm1, (%rdi,%rax) addq $16, %rax cmpq $512, %rax jne .L5 rep ret .p2align 4,,10 .p2align 3 .L15: leaq 16(%rsi), %rax cmpq %rax, %rdi jae .L7 xorl %eax, %eax .p2align 4,,10 .p2align 3 .L6: movl (%rsi,%rax), %edx movl $42, %ecx imull %ecx, %edx movl %edx, (%rdi,%rax) addq $4, %rax cmpq $512, %rax jne .L6 rep ret .cfi_endproc .LFE0: .size set, .-set .ident "GCC: (Debian 4.8.1-10) 4.8.1" .section .note.GNU-stack,"",@progbits
.file "set1.c" .text .p2align 4,,15 .globl set .type set, @function set: .LFB0: .cfi_startproc movq %rdi, %rax andl $31, %eax shrq $2, %rax negq %rax andl $7, %eax je .L7 movl (%rdi), %edx movl $42, %r11d imull %r11d, %edx cmpl $1, %eax movl %edx, (%rdi) jbe .L8 movl 4(%rdi), %edx movl $42, %r10d imull %r10d, %edx cmpl $2, %eax movl %edx, 4(%rdi) jbe .L9 movl 8(%rdi), %edx movl $42, %r9d imull %r9d, %edx cmpl $3, %eax movl %edx, 8(%rdi) jbe .L10 movl 12(%rdi), %edx movl $42, %r8d imull %r8d, %edx cmpl $4, %eax movl %edx, 12(%rdi) jbe .L11 movl 16(%rdi), %edx movl $42, %esi imull %esi, %edx cmpl $5, %eax movl %edx, 16(%rdi) jbe .L12 movl 20(%rdi), %edx movl $42, %ecx imull %ecx, %edx cmpl $6, %eax movl %edx, 20(%rdi) jbe .L13 movl 24(%rdi), %edx movl $42, %r11d movl $7, %r9d imull %r11d, %edx movl %edx, 24(%rdi) movl $121, %edx .L2: movl $128, %ecx vmovdqa .LC0(%rip), %ymm0 subl %eax, %ecx movl %eax, %eax leaq (%rdi,%rax,4), %rax movl %ecx, %r8d shrl $3, %r8d vpmulld (%rax), %ymm0, %ymm1 vmovdqa %ymm1, (%rax) cmpl $15, %r8d vpmulld 32(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 32(%rax) leal 0(,%r8,8), %esi vpmulld 64(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 64(%rax) vpmulld 96(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 96(%rax) vpmulld 128(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 128(%rax) vpmulld 160(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 160(%rax) vpmulld 192(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 192(%rax) vpmulld 224(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 224(%rax) vpmulld 256(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 256(%rax) vpmulld 288(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 288(%rax) vpmulld 320(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 320(%rax) vpmulld 352(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 352(%rax) vpmulld 384(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 384(%rax) vpmulld 416(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 416(%rax) vpmulld 448(%rax), %ymm0, %ymm1 vmovdqa %ymm1, 448(%rax) jbe .L4 vpmulld 480(%rax), %ymm0, %ymm0 vmovdqa %ymm0, 480(%rax) .L4: leal (%r9,%rsi), %eax subl %esi, %edx cmpl %esi, %ecx je .L29 movslq %eax, %rcx movl $42, %r9d leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r9d, %esi cmpl $1, %edx movl %esi, (%rcx) leal 1(%rax), %ecx je .L29 movslq %ecx, %rcx movl $42, %r8d leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r8d, %esi cmpl $2, %edx movl %esi, (%rcx) leal 2(%rax), %ecx je .L29 movslq %ecx, %rcx movl $42, %r11d leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r11d, %esi cmpl $3, %edx movl %esi, (%rcx) leal 3(%rax), %ecx je .L29 movslq %ecx, %rcx movl $42, %r10d leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r10d, %esi cmpl $4, %edx movl %esi, (%rcx) leal 4(%rax), %ecx je .L29 movslq %ecx, %rcx movl $42, %r9d leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r9d, %esi cmpl $5, %edx movl %esi, (%rcx) leal 5(%rax), %ecx je .L29 movslq %ecx, %rcx movl $42, %r8d addl $6, %eax leaq (%rdi,%rcx,4), %rcx movl (%rcx), %esi imull %r8d, %esi cmpl $6, %edx movl %esi, (%rcx) je .L29 cltq movl $42, %r10d leaq (%rdi,%rax,4), %rax movl (%rax), %edx imull %r10d, %edx movl %edx, (%rax) .L29: vzeroupper ret .p2align 4,,10 .p2align 3 .L7: movl $128, %edx xorl %r9d, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L13: movl $122, %edx movl $6, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L8: movl $127, %edx movl $1, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L9: movl $126, %edx movl $2, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L10: movl $125, %edx movl $3, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L11: movl $124, %edx movl $4, %r9d jmp .L2 .p2align 4,,10 .p2align 3 .L12: movl $123, %edx movl $5, %r9d jmp .L2 .cfi_endproc .LFE0: .size set, .-set .section 
.rodata.cst32,"aM",@progbits,32 .align 32 .LC0: .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .ident "GCC: (Debian 4.8.1-10) 4.8.1" .section .note.GNU-stack,"",@progbits
.file "set1.c" .text .p2align 4,,15 .globl set .type set, @function set: .LFB0: .cfi_startproc leaq 32(%rdi), %rax cmpq %rax, %rsi jb .L12 movq %rdi, %rsi .L6: vmovdqu (%rsi), %ymm1 vmovdqa .LC0(%rip), %ymm0 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, (%rdi) vmovdqu 32(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 32(%rdi) vmovdqu 64(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 64(%rdi) vmovdqu 96(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 96(%rdi) vmovdqu 128(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 128(%rdi) vmovdqu 160(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 160(%rdi) vmovdqu 192(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 192(%rdi) vmovdqu 224(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 224(%rdi) vmovdqu 256(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 256(%rdi) vmovdqu 288(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 288(%rdi) vmovdqu 320(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 320(%rdi) vmovdqu 352(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 352(%rdi) vmovdqu 384(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 384(%rdi) vmovdqu 416(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 416(%rdi) vmovdqu 448(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm1 vmovdqu %ymm1, 448(%rdi) vmovdqu 480(%rsi), %ymm1 vpmulld %ymm0, %ymm1, %ymm0 vmovdqu %ymm0, 480(%rdi) vzeroupper ret .p2align 4,,10 .p2align 3 .L12: leaq 32(%rsi), %rax cmpq %rax, %rdi jae .L6 xorl %eax, %eax .p2align 4,,10 .p2align 3 .L5: movl (%rsi,%rax), %edx movl $42, %ecx imull %ecx, %edx movl %edx, (%rdi,%rax) addq $4, %rax cmpq $512, %rax jne .L5 rep ret .cfi_endproc .LFE0: .size set, .-set .section .rodata.cst32,"aM",@progbits,32 .align 32 .LC0: .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .long 42 .ident "GCC: (Debian 4.8.1-10) 4.8.1" .section .note.GNU-stack,"",@progbits