On Fri, Nov 15, 2013 at 11:26:06PM +0100, Ondřej Bílka wrote:
Minor correction: a read in mutt replaced the set1.s file with one that I later
used for the AVX2 variant. The correct file follows.
.file "set1.c"
.text
# ----------------------------------------------------------------------
# set -- multiply 128 consecutive 32-bit integers by 42, in place.
#
# Syntax: GNU as, AT&T operand order (op src, dst).
# ABI:    the pointer is read from %rdi, the first integer argument
#         register of the System V AMD64 convention.
#         NOTE(review): presumably `void set(int *x)` at the C level;
#         confirm against set1.c.
#
# In:     %rdi = address of the first element.  The alignment peel below
#         only reaches a 16-byte boundary when %rdi is a multiple of 4;
#         otherwise the movdqa accesses are misaligned.
# Out:    no register is set up as a result (%rax holds leftovers).
# Clobb:  %rax, %rcx, %rdx, %rsi, %r8-%r11, %xmm0, %xmm1, flags
# Stack:  untouched (no pushes, no locals, no calls)
#
# Shape:
#   1. peel 0..3 leading elements with scalar code until the address is
#      16-byte aligned;
#   2. process whole groups of four elements with SSE2, using
#      42*x = ((2x * 4) - 2x) * 8 - 6x   (i.e. 48x - 6x);
#   3. finish the 0..3 trailing elements with scalar code.
#
# Register roles once the peel is finished:
#   %r11d = number of peeled elements (0..3)
#   %r10d = 128 - peeled   (elements still to do)
# ----------------------------------------------------------------------
        .p2align 4,,15                          # 16-byte align the entry point
        .globl  set
        .type   set, @function
set:
.LFB0:
        .cfi_startproc

        # eax = (-((x & 15) / 4)) & 3 = elements needed to reach alignment
        movq    %rdi, %rax
        andl    $15, %eax
        shrq    $2, %rax
        negq    %rax
        andl    $3, %eax
        je      .Lno_peel                       # already on a 16-byte boundary

        # --- scalar peel, element 0 ---------------------------------
        movl    $42, %esi
        movl    (%rdi), %edx
        imull   %esi, %edx
        movl    %edx, (%rdi)
        cmpl    $1, %eax
        jbe     .Lpeeled_one                    # only one element to peel

        # --- scalar peel, element 1 ---------------------------------
        movl    $42, %ecx
        movl    4(%rdi), %edx
        imull   %ecx, %edx
        movl    %edx, 4(%rdi)
        cmpl    $2, %eax
        jbe     .Lpeeled_two                    # exactly two to peel

        # --- scalar peel, element 2 ---------------------------------
        movl    $42, %r11d
        movl    8(%rdi), %edx
        imull   %r11d, %edx
        movl    %edx, 8(%rdi)
        movl    $125, %r10d                     # 128 - 3 elements remain
        movl    $3, %r11d                       # three elements peeled

.Lvector_setup:
        movl    %eax, %eax                      # zero-extend the peel count
        leaq    (%rdi,%rax,4), %rcx             # rcx = first element after the peel
        movl    $128, %r8d
        subl    %eax, %r8d                      # r8d = 128 - peeled
        movl    %r8d, %esi
        shrl    $2, %esi                        # esi = number of 4-element groups
        leal    0(,%rsi,4), %r9d                # r9d = elements the vector loop covers
        xorl    %eax, %eax                      # rax = byte offset into the aligned part
        xorl    %edx, %edx                      # edx = groups completed so far

        .p2align 4,,10                          # align the loop head: 16 if <= 10 pad bytes,
        .p2align 3                              # otherwise settle for 8
.Lvector_loop:
        movdqa  (%rcx,%rax), %xmm1              # four elements, aligned load
        pslld   $1, %xmm1                       # xmm1 = 2x
        movdqa  %xmm1, %xmm0
        pslld   $2, %xmm0                       # xmm0 = 8x
        psubd   %xmm1, %xmm0                    # xmm0 = 6x
        movdqa  %xmm0, %xmm1
        pslld   $3, %xmm1                       # xmm1 = 48x
        psubd   %xmm0, %xmm1                    # xmm1 = 42x
        movdqa  %xmm1, (%rcx,%rax)              # store back in place
        addl    $1, %edx
        addq    $16, %rax
        cmpl    %edx, %esi
        ja      .Lvector_loop                   # unsigned: more groups left

        # eax = index (relative to x) of the first unprocessed element,
        # ecx = number of elements still unprocessed (0..3)
        leal    (%r11,%r9), %eax
        movl    %r10d, %ecx
        subl    %r9d, %ecx
        cmpl    %r9d, %r8d
        je      .Lreturn                        # vector loop covered everything

        # --- scalar tail, first leftover ----------------------------
        movl    $42, %r9d
        movslq  %eax, %rdx
        leaq    (%rdi,%rdx,4), %rdx
        movl    (%rdx), %esi
        imull   %r9d, %esi
        movl    %esi, (%rdx)
        leal    1(%rax), %edx                   # index of the next leftover
        cmpl    $1, %ecx
        je      .Lreturn

        # --- scalar tail, second leftover ---------------------------
        movl    $42, %r8d
        movslq  %edx, %rdx
        leaq    (%rdi,%rdx,4), %rdx
        movl    (%rdx), %esi
        imull   %r8d, %esi
        movl    %esi, (%rdx)
        addl    $2, %eax                        # index of the last leftover
        cmpl    $2, %ecx
        je      .Lreturn

        # --- scalar tail, third leftover ----------------------------
        movl    $42, %r10d
        cltq
        leaq    (%rdi,%rax,4), %rax
        movl    (%rax), %edx
        imull   %r10d, %edx
        movl    %edx, (%rax)
        ret

        .p2align 4,,10
        .p2align 3
.Lreturn:
        # NOTE(review): `rep ret` looks like the usual compiler idiom for a
        # return that is a branch target on some AMD cores -- confirm.
        rep ret

        # Out-of-line stubs: record how many elements were peeled and how
        # many remain, then rejoin the common vector setup.
        .p2align 4,,10
        .p2align 3
.Lno_peel:
        movl    $128, %r10d                     # nothing peeled, all 128 remain
        xorl    %r11d, %r11d
        jmp     .Lvector_setup

        .p2align 4,,10
        .p2align 3
.Lpeeled_one:
        movl    $127, %r10d
        movl    $1, %r11d
        jmp     .Lvector_setup

        .p2align 4,,10
        .p2align 3
.Lpeeled_two:
        movl    $126, %r10d
        movl    $2, %r11d
        jmp     .Lvector_setup

        .cfi_endproc
.LFE0:
        .size   set, .-set
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits