hi there. i tried sending this ages ago, but i guess a spam filter probably ate it... i see now that i have to be subscribed to post.
on a p4 2.4GHz, here's the before:

                    sign     verify    sign/s  verify/s
  rsa 1024 bits   0.0044s   0.0002s     225.5    4139.4

and after:

                    sign     verify    sign/s  verify/s
  rsa 1024 bits   0.0033s   0.0002s     306.7    6264.2

see the hacked patch below.

-dean

Date: Sun, 23 Mar 2003 22:08:25 -0800 (PST)
From: dean gaudet <[EMAIL PROTECTED]>
To: [EMAIL PROTECTED]
Subject: SSE2 inner loop for bn_mul_add_words

for kicks i decided to see if it really was possible to get RSA speedups using the SSE2 PMULUDQ and PADDQ instructions... and i'm seeing a 30%+ improvement in 1024-bit sign/s on the p4 systems i've measured. but i'm too lazy to try to understand the perl asm generation crud, and i don't want to figure out how (or if) you folks would prefer to switch between routines optimized for specific platforms, so what i present here is a total hack patch against a .s file. do with it as you please :)

note that i use %mm registers rather than %xmm registers because this code is completely dominated by the carry propagation, which is a series of 64-bit adds each followed by a shift right by 32... if you attempt to do this with 128-bit registers you waste a lot of slots packing, shuffling, and otherwise mucking about. even so, this is SSE2-only code, because the PMULUDQ and PADDQ instructions don't exist in MMX/SSE. (which means the only released processors it will run on are the p4 and banias^Wpentium-m... it shows similar improvements on unreleased processors i can't talk about :)

if you look closely, i'm doing only 32-bit loads and stores... the implicit zero-extension on the 32-bit load beats any creative attempt to do 64-bit loads and shuffle the halves around.

it's unlikely this technique can speed up the simple add/sub routines -- unless there are situations where multiple non-dependent add/subs could be done in parallel. in the MMX hardware you can effectively parallelize non-dependent carry propagation -- something you can't do in the integer ALUs, because every carrying add both reads and writes EFLAGS.CF, so all the chains serialize on that one flag.

this code probably still has slack that could be taken up... such as moving the emms somewhere much higher in the call stack (it's only required before any fp code runs), and rearranging the loop so that the multiplications overlap better with the carry-chain propagation.

-dean

p.s. i'm not on the mailing list, so please CC me on any reply.
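[for anyone who doesn't have the bn code in their head: the routine being patched, bn_mul_add_words, is a multiply-accumulate over an array of words. here's a plain C sketch of its semantics, not the actual openssl source -- the _ref name and the explicit uint32_t word type are just for illustration (BN_ULONG is 32 bits on 32-bit x86):

    #include <stdint.h>
    #include <stddef.h>

    /* sketch: rp[i] += ap[i]*w with carry propagation; returns the
     * final carry.  each step is a 32x32->64 multiply plus two 32-bit
     * addends, which is exactly the PMULUDQ + PADDQ pattern below. */
    static uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                         size_t num, uint32_t w)
    {
        uint64_t carry = 0;
        for (size_t i = 0; i < num; i++) {
            uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
            rp[i] = (uint32_t)t;    /* low 32 bits back to memory */
            carry = t >> 32;        /* high 32 bits become the next carry */
        }
        return (uint32_t)carry;
    }

worst case is (2^32-1)*(2^32-1) + 2*(2^32-1) = 2^64-1, so the 64-bit accumulator can't overflow -- that's why a single 64-bit add chain per word is safe.]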
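[and here's the same per-word step written with SSE2 intrinsics, to show the 64-bit-add / shift-right-32 carry dance the asm does in %mm registers. this is only a sketch: maw_step is a made-up name, and it uses the 128-bit __m128i type because that's what <emmintrin.h> exposes, whereas the real patch stays in 64-bit %mm registers precisely to avoid the packing overhead described above:

    #include <emmintrin.h>
    #include <stdint.h>

    /* one word of the inner loop: carry_in + C[i] + scale*A[i] is
     * formed as a 64-bit value; the low 32 bits go back to C[i] and
     * the high 32 bits are the carry out.  movd's implicit
     * zero-extension does the 32->64 widening for free. */
    static uint32_t maw_step(uint32_t *c, uint32_t a,
                             uint32_t scale, uint32_t carry_in)
    {
        __m128i acc = _mm_mul_epu32(_mm_cvtsi32_si128((int)a),
                                    _mm_cvtsi32_si128((int)scale));  /* pmuludq */
        acc = _mm_add_epi64(acc, _mm_cvtsi32_si128((int)*c));        /* paddq */
        acc = _mm_add_epi64(acc, _mm_cvtsi32_si128((int)carry_in));  /* paddq */
        *c = (uint32_t)_mm_cvtsi128_si32(acc);                       /* movd */
        return (uint32_t)_mm_cvtsi128_si32(_mm_srli_epi64(acc, 32)); /* psrlq */
    }

the carry chain itself is serial -- each word's carry feeds the next -- which is why the asm below hoists the independent PMULUDQs ahead of it rather than trying to vectorize across words.]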
--- openssl-0.9.7a/crypto/bn/asm/bn86-elf.s     2003-03-23 21:29:16.000000000 -0800
+++ openssl-0.9.7a/crypto/bn/asm/bn86-elf.s.dg2 2003-03-23 21:18:05.000000000 -0800
@@ -26,94 +26,76 @@
        movl    32(%esp),       %ebp
        pushl   %ecx
        jz      .L000maw_finish
-.L001maw_loop:
-       movl    %ecx,           (%esp)
-
-       movl    (%ebx),         %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    (%edi),         %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           (%edi)
-       movl    %edx,           %esi
+       movd    %ebp,%mm0
+       pxor    %mm1,%mm1       /* mm1 = carry in */
+
-
-       movl    4(%ebx),        %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    4(%edi),        %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           4(%edi)
-       movl    %edx,           %esi
-
-       movl    8(%ebx),        %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    8(%edi),        %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           8(%edi)
-       movl    %edx,           %esi
-
-       movl    12(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    12(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           12(%edi)
-       movl    %edx,           %esi
-
-       movl    16(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    16(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           16(%edi)
-       movl    %edx,           %esi
-
-       movl    20(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    20(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           20(%edi)
-       movl    %edx,           %esi
-
-       movl    24(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    24(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           24(%edi)
-       movl    %edx,           %esi
-
-       movl    28(%ebx),       %eax
-       mull    %ebp
-       addl    %esi,           %eax
-       movl    28(%edi),       %esi
-       adcl    $0,             %edx
-       addl    %esi,           %eax
-       adcl    $0,             %edx
-       movl    %eax,           28(%edi)
-       movl    %edx,           %esi
+
+.L001maw_loop:
+       movd    (%edi),%mm3     /* mm3 = C[0] */
+       paddq   %mm3,%mm1       /* mm1 = carry_in + C[0] */
+       movd    (%ebx),%mm2     /* mm2 = A[0] */
+       pmuludq %mm0,%mm2       /* mm2 = scale*A[0] */
+       movd    4(%ebx),%mm4    /* mm4 = A[1] */
+       pmuludq %mm0,%mm4       /* mm4 = scale*A[1] */
+       movd    8(%ebx),%mm6    /* mm6 = A[2] */
+       pmuludq %mm0,%mm6       /* mm6 = scale*A[2] */
+       movd    12(%ebx),%mm7   /* mm7 = A[3] */
+       pmuludq %mm0,%mm7       /* mm7 = scale*A[3] */
+       paddq   %mm2,%mm1       /* mm1 = carry_in + C[0] + scale*A[0] */
+       movd    4(%edi),%mm3    /* mm3 = C[1] */
+       paddq   %mm4,%mm3       /* mm3 = C[1] + scale*A[1] */
+       movd    8(%edi),%mm5    /* mm5 = C[2] */
+       paddq   %mm6,%mm5       /* mm5 = C[2] + scale*A[2] */
+       movd    12(%edi),%mm4   /* mm4 = C[3] */
+       paddq   %mm4,%mm7       /* mm7 = C[3] + scale*A[3] */
+       movd    %mm1,(%edi)
+       movd    16(%ebx),%mm2   /* mm2 = A[4] */
+       pmuludq %mm0,%mm2       /* mm2 = scale*A[4] */
+       psrlq   $32,%mm1        /* mm1 = carry0 */
+       movd    20(%ebx),%mm4   /* mm4 = A[5] */
+       pmuludq %mm0,%mm4       /* mm4 = scale*A[5] */
+       paddq   %mm3,%mm1       /* mm1 = carry0 + C[1] + scale*A[1] */
+       movd    24(%ebx),%mm6   /* mm6 = A[6] */
+       pmuludq %mm0,%mm6       /* mm6 = scale*A[6] */
+       movd    %mm1,4(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry1 */
+       movd    28(%ebx),%mm3   /* mm3 = A[7] */
+       add     $32,%ebx
+       pmuludq %mm0,%mm3       /* mm3 = scale*A[7] */
+       paddq   %mm5,%mm1       /* mm1 = carry1 + C[2] + scale*A[2] */
+       movd    16(%edi),%mm5   /* mm5 = C[4] */
+       paddq   %mm5,%mm2       /* mm2 = C[4] + scale*A[4] */
+       movd    %mm1,8(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry2 */
+       paddq   %mm7,%mm1       /* mm1 = carry2 + C[3] + scale*A[3] */
+       movd    20(%edi),%mm5   /* mm5 = C[5] */
+       paddq   %mm5,%mm4       /* mm4 = C[5] + scale*A[5] */
+       movd    %mm1,12(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry3 */
+       paddq   %mm2,%mm1       /* mm1 = carry3 + C[4] + scale*A[4] */
+       movd    24(%edi),%mm5   /* mm5 = C[6] */
+       paddq   %mm5,%mm6       /* mm6 = C[6] + scale*A[6] */
+       movd    %mm1,16(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry4 */
+       paddq   %mm4,%mm1       /* mm1 = carry4 + C[5] + scale*A[5] */
+       movd    28(%edi),%mm5   /* mm5 = C[7] */
+       paddq   %mm5,%mm3       /* mm3 = C[7] + scale*A[7] */
+       movd    %mm1,20(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry5 */
+       paddq   %mm6,%mm1       /* mm1 = carry5 + C[6] + scale*A[6] */
+       movd    %mm1,24(%edi)
+       psrlq   $32,%mm1        /* mm1 = carry6 */
+       paddq   %mm3,%mm1       /* mm1 = carry6 + C[7] + scale*A[7] */
+       movd    %mm1,28(%edi)
+       add     $32,%edi
+       psrlq   $32,%mm1        /* mm1 = carry_out */
+
+       sub     $8,%ecx
+       jne     .L001maw_loop
+       movd    %mm1,%esi       /* esi = carry_out */
+       emms
+       mov     %ecx,(%esp)
+
-
-       movl    (%esp),         %ecx
-       addl    $32,            %ebx
-       addl    $32,            %edi
-       subl    $8,             %ecx
-       jnz     .L001maw_loop
 .L000maw_finish:
        movl    32(%esp),       %ecx
        andl    $7,             %ecx

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                        [EMAIL PROTECTED]
Automated List Manager                          [EMAIL PROTECTED]