hi there. i tried sending this ages ago, but i guess a spam filter probably ate it... i see now that i have to be subscribed to post.
on a p4 2.4GHz, here's the before:

                    sign     verify    sign/s  verify/s
  rsa 1024 bits   0.0044s   0.0002s     225.5    4139.4

and after:

                    sign     verify    sign/s  verify/s
  rsa 1024 bits   0.0033s   0.0002s     306.7    6264.2

see the hacked patch below.

-dean

Date: Sun, 23 Mar 2003 22:08:25 -0800 (PST)
From: dean gaudet <[EMAIL PROTECTED]>
To: [EMAIL PROTECTED]
Subject: SSE2 inner loop for bn_mul_add_words

for kicks i decided to see if it really was possible to get RSA speedups using the SSE2 PMULUDQ and PADDQ instructions... and i'm seeing a 30%+ improvement in 1024-bit sign/s on the p4 systems i've measured. but i'm too lazy to try to understand the perl asm generation crud, and i don't want to figure out how (or if) you folks would prefer to switch between routines optimized for specific platforms, so what i present here is a total hack patch against a .s file. do with it as you please :)

note that i use %mm registers rather than %xmm registers because this code is completely dominated by the carry propagation, which is a series of 64-bit adds each followed by a shift right by 32... if you attempt to do this with 128-bit registers you waste a lot of slots packing, shuffling, and otherwise mucking about. even so, this is SSE2-only code, because the PMULUDQ and PADDQ instructions don't exist in MMX/SSE. (which means the only released processors it will run on are the p4 and banias^Wpentium-m... it shows similar improvements on unreleased processors i can't talk about :)

if you look closely, i'm doing only 32-bit loads and stores... the implicit zero-extension on the 32-bit load beats any creative attempt to do 64-bit loads and shuffle the halves around.

it's unlikely this technique can speed up the simple add/sub routines -- unless there are situations where multiple non-dependent add/subs could be done in parallel. in the MMX hardware you can effectively parallelize non-dependent carry propagation -- something you can't do in the integer ALUs, because every carrying add both reads and writes EFLAGS.CF, so all the chains serialize on that one flag.

this code probably still has slack that could be taken up... such as moving the emms somewhere much higher in the call stack (it's only required before any fp code runs), and rearranging the loop so that the multiplications overlap better with the carry-chain propagation.

-dean

p.s. i'm not on the mailing list, so please CC me on any reply.
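[for anyone who doesn't have the bn code in their head: the routine being patched, bn_mul_add_words, is a multiply-accumulate over an array of words. here's a plain C sketch of its semantics, not the actual openssl source -- the _ref name and the explicit uint32_t word type are just for illustration (BN_ULONG is 32 bits on 32-bit x86):

    #include <stdint.h>
    #include <stddef.h>

    /* sketch: rp[i] += ap[i]*w with carry propagation; returns the
     * final carry.  each step is a 32x32->64 multiply plus two 32-bit
     * addends, which is exactly the PMULUDQ + PADDQ pattern below. */
    static uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                         size_t num, uint32_t w)
    {
        uint64_t carry = 0;
        for (size_t i = 0; i < num; i++) {
            uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
            rp[i] = (uint32_t)t;    /* low 32 bits back to memory */
            carry = t >> 32;        /* high 32 bits become the next carry */
        }
        return (uint32_t)carry;
    }

worst case is (2^32-1)*(2^32-1) + 2*(2^32-1) = 2^64-1, so the 64-bit accumulator can't overflow -- that's why a single 64-bit add chain per word is safe.]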
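[and here's the same per-word step written with SSE2 intrinsics, to show the 64-bit-add / shift-right-32 carry dance the asm does in %mm registers. this is only a sketch: maw_step is a made-up name, and it uses the 128-bit __m128i type because that's what <emmintrin.h> exposes, whereas the real patch stays in 64-bit %mm registers precisely to avoid the packing overhead described above:

    #include <emmintrin.h>
    #include <stdint.h>

    /* one word of the inner loop: carry_in + C[i] + scale*A[i] is
     * formed as a 64-bit value; the low 32 bits go back to C[i] and
     * the high 32 bits are the carry out.  movd's implicit
     * zero-extension does the 32->64 widening for free. */
    static uint32_t maw_step(uint32_t *c, uint32_t a,
                             uint32_t scale, uint32_t carry_in)
    {
        __m128i acc = _mm_mul_epu32(_mm_cvtsi32_si128((int)a),
                                    _mm_cvtsi32_si128((int)scale));  /* pmuludq */
        acc = _mm_add_epi64(acc, _mm_cvtsi32_si128((int)*c));        /* paddq */
        acc = _mm_add_epi64(acc, _mm_cvtsi32_si128((int)carry_in));  /* paddq */
        *c = (uint32_t)_mm_cvtsi128_si32(acc);                       /* movd */
        return (uint32_t)_mm_cvtsi128_si32(_mm_srli_epi64(acc, 32)); /* psrlq */
    }

the carry chain itself is serial -- each word's carry feeds the next -- which is why the asm below hoists the independent PMULUDQs ahead of it rather than trying to vectorize across words.]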
--- openssl-0.9.7a/crypto/bn/asm/bn86-elf.s     2003-03-23 21:29:16.000000000 -0800
+++ openssl-0.9.7a/crypto/bn/asm/bn86-elf.s.dg2 2003-03-23 21:18:05.000000000 -0800
@@ -26,94 +26,76 @@
        movl    32(%esp),       %ebp
        pushl   %ecx
        jz      .L000maw_finish
-.L001maw_loop:
-       movl    %ecx,           (%esp)
-
-       movl    (%ebx),         %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    (%edi),         %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           (%edi)
-       movl    %edx,           %esi
+       movd    %ebp,%mm0
+       pxor    %mm1,%mm1       /* mm1 = carry in */
+
-
-       movl    4(%ebx),        %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    4(%edi),        %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           4(%edi)
-       movl    %edx,           %esi
-
-       movl    8(%ebx),        %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    8(%edi),        %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           8(%edi)
-       movl    %edx,           %esi
-
-       movl    12(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    12(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           12(%edi)
-       movl    %edx,           %esi
-
-       movl    16(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    16(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           16(%edi)
-       movl    %edx,           %esi
-
-       movl    20(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    20(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           20(%edi)
-       movl    %edx,           %esi
-
-       movl    24(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    24(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           24(%edi)
-       movl    %edx,           %esi
-
-       movl    28(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    28(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           28(%edi)
-       movl    %edx,           %esi
+
+.L001maw_loop:
+       movd    (%edi),%mm3     /* mm3 = C[0] */
+       paddq   %mm3,%mm1       /* mm1 = carry_in + C[0] */
+       movd    (%ebx),%mm2     /* mm2 = A[0] */
+       pmuludq %mm0,%mm2       /* mm2 = scale*A[0] */
+       movd    4(%ebx),%mm4    /* mm4 = A[1] */
+       pmuludq %mm0,%mm4       /* mm4 = scale*A[1] */
+       movd    8(%ebx),%mm6    /* mm6 = A[2] */
+       pmuludq %mm0,%mm6       /* mm6 = scale*A[2] */
+       movd    12(%ebx),%mm7   /* mm7 = A[3] */
+       pmuludq %mm0,%mm7       /* mm7 = scale*A[3] */
+       paddq   %mm2,%mm1       /* mm1 = carry_in + C[0] + scale*A[0] */
+       movd    4(%edi),%mm3    /* mm3 = C[1] */
+       paddq   %mm4,%mm3       /* mm3 = C[1] + scale*A[1] */
+       movd    8(%edi),%mm5    /* mm5 = C[2] */
+       paddq   %mm6,%mm5       /* mm5 = C[2] + scale*A[2] */
+       movd    12(%edi),%mm4   /* mm4 = C[3] */
+       paddq   %mm4,%mm7       /* mm7 = C[3] + scale*A[3] */
+       movd    %mm1,(%edi)
+       movd    16(%ebx),%mm2   /* mm2 = A[4] */
+       pmuludq %mm0,%mm2       /* mm2 = scale*A[4] */
+       psrlq   $32,%mm1        /* mm1 = carry0 */
+       movd    20(%ebx),%mm4   /* mm4 = A[5] */
+       pmuludq %mm0,%mm4       /* mm4 = scale*A[5] */
+       paddq   %mm3,%mm1       /* mm1 = carry0 + C[1] + scale*A[1] */
+       movd    24(%ebx),%mm6   /* mm6 = A[6] */
+       pmuludq %mm0,%mm6       /* mm6 = scale*A[6] */
+       movd    %mm1,4(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry1 */
+       movd    28(%ebx),%mm3   /* mm3 = A[7] */
+       add     $32,%ebx
+       pmuludq %mm0,%mm3       /* mm3 = scale*A[7] */
+       paddq   %mm5,%mm1       /* mm1 = carry1 + C[2] + scale*A[2] */
+       movd    16(%edi),%mm5   /* mm5 = C[4] */
+       paddq   %mm5,%mm2       /* mm2 = C[4] + scale*A[4] */
+       movd    %mm1,8(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry2 */
+       paddq   %mm7,%mm1       /* mm1 = carry2 + C[3] + scale*A[3] */
+       movd    20(%edi),%mm5   /* mm5 = C[5] */
+       paddq   %mm5,%mm4       /* mm4 = C[5] + scale*A[5] */
+       movd    %mm1,12(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry3 */
+       paddq   %mm2,%mm1       /* mm1 = carry3 + C[4] + scale*A[4] */
+       movd    24(%edi),%mm5   /* mm5 = C[6] */
+       paddq   %mm5,%mm6       /* mm6 = C[6] + scale*A[6] */
+       movd    %mm1,16(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry4 */
+       paddq   %mm4,%mm1       /* mm1 = carry4 + C[5] + scale*A[5] */
+       movd    28(%edi),%mm5   /* mm5 = C[7] */
+       paddq   %mm5,%mm3       /* mm3 = C[7] + scale*A[7] */
+       movd    %mm1,20(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry5 */
+       paddq   %mm6,%mm1       /* mm1 = carry5 + C[6] + scale*A[6] */
+       movd    %mm1,24(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry6 */
+       paddq   %mm3,%mm1       /* mm1 = carry6 + C[7] + scale*A[7] */
+       movd    %mm1,28(%edi)
+       add     $32,%edi
+       psrlq   $32,%mm1        /* mm1 = carry_out */
+
+       sub     $8,%ecx
+       jne     .L001maw_loop
+       movd    %mm1,%esi       /* esi = carry_out */
+       emms
+       mov     %ecx,(%esp)
+
-
-       movl    (%esp),         %ecx
-       addl    $32,            %ebx
-       addl    $32,            %edi
-       subl    $8,             %ecx
-       jnz     .L001maw_loop
 .L000maw_finish:
        movl    32(%esp),       %ecx
        andl    $7,             %ecx

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                        [EMAIL PROTECTED]
Automated List Manager                          [EMAIL PROTECTED]