[PATCH 5/7] sparc: Add support for SHA{1,256,512} opcodes.

David Miller Wed, 19 Sep 2012 20:43:47 -0700

The following "openssl speed sha" measurements were taken on a
SPARC T4.  Figures are throughput in thousands of bytes per second
for each input block size; higher is better.


Baseline (OPENSSL_sparcv9cap=0):

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
sha1             24322.20k    71309.59k   153340.94k   216593.41k   246923.26k
sha256           17516.97k    40926.25k    72628.65k    89984.68k    96938.67k
sha512           13219.95k    52850.94k    80323.41k   112189.44k   127052.46k

With SHA opcodes enabled:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
sha1             33231.78k   115492.48k   318273.91k   579320.83k   759701.50k
sha256           46641.41k   157805.85k   419859.54k   708643.16k   889514.67k
sha512           50184.57k   202770.99k   529172.57k  1023763.11k  1405414.06k

Signed-off-by: David S. Miller <[email protected]>
---
 crypto/sha/Makefile              |   13 ++-
 crypto/sha/asm/sha1-sparcv9.pl   |   70 +++++++++++++++
 crypto/sha/asm/sha512-sparcv9.pl |  175 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 255 insertions(+), 3 deletions(-)

diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
index 79be651..06031b7 100644
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@@ -62,13 +62,20 @@ sha256-armv4.S: asm/sha256-armv4.pl
 sha1-alpha.s:  asm/sha1-alpha.pl
        $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
 
+sha1-sparcv9.s: sha1-sparcv9.S # run the generated .S through the C preprocessor (-E) to produce the .s
+       $(CC) $(CFLAGS) -E sha1-sparcv9.S > $@
+sha256-sparcv9.s: sha256-sparcv9.S # same preprocessing step for the SHA256 output
+       $(CC) $(CFLAGS) -E sha256-sparcv9.S > $@
+sha512-sparcv9.s: sha512-sparcv9.S # same preprocessing step for the SHA512 output
+       $(CC) $(CFLAGS) -E sha512-sparcv9.S > $@
+
 # Solaris make has to be explicitly told
 sha1-x86_64.s: asm/sha1-x86_64.pl;     $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
 sha256-x86_64.s:asm/sha512-x86_64.pl;  $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
 sha512-x86_64.s:asm/sha512-x86_64.pl;  $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
-sha1-sparcv9.s:        asm/sha1-sparcv9.pl;    $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
-sha256-sparcv9.s:asm/sha512-sparcv9.pl;        $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
-sha512-sparcv9.s:asm/sha512-sparcv9.pl;        $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+sha1-sparcv9.S:        asm/sha1-sparcv9.pl;    $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
+sha256-sparcv9.S:asm/sha512-sparcv9.pl;        $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+sha512-sparcv9.S:asm/sha512-sparcv9.pl;        $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
 
 sha1-ppc.s:    asm/sha1-ppc.pl;        $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
 sha256-ppc.s:  asm/sha512-ppc.pl;      $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl
index 5c161ce..f3bf479 100644
--- a/crypto/sha/asm/sha1-sparcv9.pl
+++ b/crypto/sha/asm/sha1-sparcv9.pl
@@ -183,11 +183,81 @@ $code.=<<___ if ($bits==64);
 .register      %g3,#scratch
 ___
 $code.=<<___;
+#include "sparc_arch.h"
+
 .section       ".text",#alloc,#execinstr
 
+SPARC_PIC_THUNK(g2)
+
 .align 32
 .globl sha1_block_data_order
 sha1_block_data_order:
+       SPARC_LOAD_V9_CAPS_LEAF(g2, g1)         ! NOTE(review): macro from sparc_arch.h (not shown); presumably leaves the capability word in %g2
+       andcc   %g2, SPARCV9_SHA1, %g0          ! test the SHA1 capability bit; result discarded, only condition codes kept
+       be      .Lsoftware                      ! bit clear: fall back to the existing software implementation
+        nop                                    ! branch delay slot
+
+       ld      [%o0 + 0x00], %f0               ! hardware path: load five 32-bit state words from [%o0] into %f0-%f4
+       ld      [%o0 + 0x04], %f1
+       ld      [%o0 + 0x08], %f2
+       andcc   %o1, 0x7, %g0                   ! is the input pointer in %o1 8-byte aligned?
+       ld      [%o0 + 0x0c], %f3
+       bne     .Lhwunaligned                   ! no: use the alignaddr/faligndata loop instead
+        ld     [%o0 + 0x10], %f4               ! delay slot: fifth state word, loaded on both paths
+.Lhwloop_aligned:
+       ldd     [%o1 + 0x00], %f8               ! load one 64-byte input block into %f8-%f23
+       ldd     [%o1 + 0x08], %f10
+       ldd     [%o1 + 0x10], %f12
+       ldd     [%o1 + 0x18], %f14
+       ldd     [%o1 + 0x20], %f16
+       ldd     [%o1 + 0x28], %f18
+       ldd     [%o1 + 0x30], %f20
+       ldd     [%o1 + 0x38], %f22
+       .word   0x81b02820              ! SHA1 opcode emitted as a raw word; presumably consumes %f8-%f23 and updates %f0-%f4 (TODO confirm)
+       subcc   %o2, 1, %o2                     ! one block done; NOTE(review): count is tested only after a block, so a zero count on entry would wrap
+       bne     .Lhwloop_aligned                ! more blocks remaining
+        add    %o1, 0x40, %o1                  ! delay slot: advance input pointer by 64 bytes
+.Lhwfinish:
+       st      %f0, [%o0 + 0x00]               ! write the five state words back to [%o0]
+       st      %f1, [%o0 + 0x04]
+       st      %f2, [%o0 + 0x08]
+       st      %f3, [%o0 + 0x0c]
+       retl                                    ! leaf return, no register window was opened on this path
+        st     %f4, [%o0 + 0x10]               ! delay slot: last state word
+
+.Lhwunaligned:
+       alignaddr %o1, %g0, %o1                 ! round %o1 down to 8 bytes; the byte offset is kept in GSR for faligndata
+
+       ldd     [%o1 + 0x00], %f10              ! prime the pipeline with the first aligned doubleword
+.Lhwloop_unaligned:
+       ldd     [%o1 + 0x08], %f12              ! load eight more doublewords (offsets 0x08-0x40), nine in flight in total
+       ldd     [%o1 + 0x10], %f14
+       ldd     [%o1 + 0x18], %f16
+       ldd     [%o1 + 0x20], %f18
+       ldd     [%o1 + 0x28], %f20
+       ldd     [%o1 + 0x30], %f22
+       ldd     [%o1 + 0x38], %f24
+       ldd     [%o1 + 0x40], %f26              ! doubleword that straddles the end of this block
+
+       faligndata %f10, %f12, %f8              ! realign: each adjacent pair yields one 8-byte chunk of the block, into %f8-%f23
+       faligndata %f12, %f14, %f10
+       faligndata %f14, %f16, %f12
+       faligndata %f16, %f18, %f14
+       faligndata %f18, %f20, %f16
+       faligndata %f20, %f22, %f18
+       faligndata %f22, %f24, %f20
+       faligndata %f24, %f26, %f22
+
+       .word   0x81b02820              ! SHA1 opcode, same raw encoding as in the aligned loop
+
+       subcc   %o2, 1, %o2                     ! decrement block count, set condition codes for the bne below
+       fsrc2   %f26, %f10                      ! carry the straddling doubleword over as the first one of the next block
+       bne     .Lhwloop_unaligned              ! fsrc2 does not touch the integer condition codes set by subcc
+        add    %o1, 0x40, %o1                  ! delay slot: advance the aligned pointer by 64 bytes
+       b       .Lhwfinish                      ! all blocks done: share the state write-back and return
+        nop                                    ! branch delay slot
+
+.Lsoftware:
        save    %sp,-$frame,%sp
        sllx    $len,6,$len
        add     $inp,$len,$len
diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl
index e728d6e..e5781fe 100644
--- a/crypto/sha/asm/sha512-sparcv9.pl
+++ b/crypto/sha/asm/sha512-sparcv9.pl
@@ -387,8 +387,12 @@ $code.=<<___ if ($bits==64);
 .register      %g3,#scratch
 ___
 $code.=<<___;
+#include "sparc_arch.h"
+
 .section       ".text",#alloc,#execinstr
 
+SPARC_PIC_THUNK(g2)
+
 .align 64
 K${label}:
 .type  K${label},#object
@@ -460,6 +464,177 @@ $code.=<<___;
 .size  K${label},.-K${label}
 .globl sha${label}_block_data_order
 sha${label}_block_data_order:
+       SPARC_LOAD_V9_CAPS_LEAF(g2, g1)         ! NOTE(review): macro from sparc_arch.h (not shown); presumably leaves the capability word in %g2
+       set     SPARCV9_SHA${label}, %g1        ! materialize the capability mask for this variant in %g1
+       andcc   %g2, %g1, %g0                   ! test the bit; result discarded, only condition codes kept
+       be      .Lsoftware                      ! bit clear: fall back to the existing software implementation
+        nop                                    ! branch delay slot
+___
+$code.=<<___ if ($SZ==8); # SHA512
+       ldd     [%o0 + 0x00], %f0               ! hardware path: load eight 64-bit state words from [%o0] into %f0-%f14
+       ldd     [%o0 + 0x08], %f2
+       ldd     [%o0 + 0x10], %f4
+       ldd     [%o0 + 0x18], %f6
+       ldd     [%o0 + 0x20], %f8
+       ldd     [%o0 + 0x28], %f10
+       andcc   %o1, 0x7, %g0                   ! is the input pointer in %o1 8-byte aligned?
+       ldd     [%o0 + 0x30], %f12
+       bne,pn  %xcc, .Lhwunaligned             ! no (predicted not taken): use the alignaddr/faligndata loop
+        ldd    [%o0 + 0x38], %f14              ! delay slot: eighth state word, loaded on both paths
+
+.Lhwaligned_loop:
+       ldd     [%o1 + 0x00], %f16              ! load one 128-byte input block into %f16-%f46
+       ldd     [%o1 + 0x08], %f18
+       ldd     [%o1 + 0x10], %f20
+       ldd     [%o1 + 0x18], %f22
+       ldd     [%o1 + 0x20], %f24
+       ldd     [%o1 + 0x28], %f26
+       ldd     [%o1 + 0x30], %f28
+       ldd     [%o1 + 0x38], %f30
+       ldd     [%o1 + 0x40], %f32
+       ldd     [%o1 + 0x48], %f34
+       ldd     [%o1 + 0x50], %f36
+       ldd     [%o1 + 0x58], %f38
+       ldd     [%o1 + 0x60], %f40
+       ldd     [%o1 + 0x68], %f42
+       ldd     [%o1 + 0x70], %f44
+       ldd     [%o1 + 0x78], %f46
+
+       .word   0x81b02860      ! SHA512 opcode emitted as a raw word; presumably consumes %f16-%f46 and updates %f0-%f14 (TODO confirm)
+
+       subcc   %o2, 1, %o2                     ! one block done; NOTE(review): count is tested only after a block, so a zero count on entry would wrap
+       bne,pt  %icc, .Lhwaligned_loop          ! NOTE(review): %icc tests only the low 32 bits of the count in %o2; confirm that is intended for 64-bit builds
+        add    %o1, 0x80, %o1                  ! delay slot: advance input pointer by 128 bytes
+.Lhwfinish:
+       std     %f0, [%o0 + 0x00]               ! write the eight state words back to [%o0]
+       std     %f2, [%o0 + 0x08]
+       std     %f4, [%o0 + 0x10]
+       std     %f6, [%o0 + 0x18]
+       std     %f8, [%o0 + 0x20]
+       std     %f10, [%o0 + 0x28]
+       std     %f12, [%o0 + 0x30]
+       retl                                    ! leaf return, no register window was opened on this path
+        std    %f14, [%o0 + 0x38]              ! delay slot: last state word
+.Lhwunaligned:
+       alignaddr %o1, %g0, %o1                 ! round %o1 down to 8 bytes; the byte offset is kept in GSR for faligndata
+
+       ldd     [%o1 + 0x00], %f18              ! prime the pipeline with the first aligned doubleword
+.Lhwunaligned_loop:
+       ldd     [%o1 + 0x08], %f20              ! load sixteen more doublewords (offsets 0x08-0x80), seventeen in flight in total
+       ldd     [%o1 + 0x10], %f22
+       ldd     [%o1 + 0x18], %f24
+       ldd     [%o1 + 0x20], %f26
+       ldd     [%o1 + 0x28], %f28
+       ldd     [%o1 + 0x30], %f30
+       ldd     [%o1 + 0x38], %f32
+       ldd     [%o1 + 0x40], %f34
+       ldd     [%o1 + 0x48], %f36
+       ldd     [%o1 + 0x50], %f38
+       ldd     [%o1 + 0x58], %f40
+       ldd     [%o1 + 0x60], %f42
+       ldd     [%o1 + 0x68], %f44
+       ldd     [%o1 + 0x70], %f46
+       ldd     [%o1 + 0x78], %f48
+       ldd     [%o1 + 0x80], %f50              ! doubleword that straddles the end of this block
+
+       faligndata %f18, %f20, %f16             ! realign: each adjacent pair yields one 8-byte chunk of the block, into %f16-%f46
+       faligndata %f20, %f22, %f18
+       faligndata %f22, %f24, %f20
+       faligndata %f24, %f26, %f22
+       faligndata %f26, %f28, %f24
+       faligndata %f28, %f30, %f26
+       faligndata %f30, %f32, %f28
+       faligndata %f32, %f34, %f30
+       faligndata %f34, %f36, %f32
+       faligndata %f36, %f38, %f34
+       faligndata %f38, %f40, %f36
+       faligndata %f40, %f42, %f38
+       faligndata %f42, %f44, %f40
+       faligndata %f44, %f46, %f42
+       faligndata %f46, %f48, %f44
+       faligndata %f48, %f50, %f46
+
+       .word   0x81b02860      ! SHA512 opcode, same raw encoding as in the aligned loop
+
+       subcc   %o2, 1, %o2                     ! decrement block count, set condition codes for the bne below
+       fsrc2   %f50, %f18                      ! carry the straddling doubleword over as the first one of the next block
+       bne,pt  %icc, .Lhwunaligned_loop        ! same low-32-bit count test as the aligned loop
+        add    %o1, 0x80, %o1                  ! delay slot: advance the aligned pointer by 128 bytes
+
+       ba,a,pt %xcc, .Lhwfinish                ! annulled branch (no delay slot executed) to the shared write-back and return
+___
+$code.=<<___ if ($SZ==4); # SHA256
+       ld      [%o0 + 0x00], %f0               ! hardware path: load eight 32-bit state words from [%o0] into %f0-%f7
+       ld      [%o0 + 0x04], %f1
+       ld      [%o0 + 0x08], %f2
+       ld      [%o0 + 0x0c], %f3
+       ld      [%o0 + 0x10], %f4
+       ld      [%o0 + 0x14], %f5
+       andcc   %o1, 0x7, %g0                   ! is the input pointer in %o1 8-byte aligned?
+       ld      [%o0 + 0x18], %f6
+       bne,pn  %xcc, .Lhwunaligned             ! no (predicted not taken): use the alignaddr/faligndata loop
+        ld     [%o0 + 0x1c], %f7               ! delay slot: eighth state word, loaded on both paths
+
+.Lhwloop:
+       ldd     [%o1 + 0x00], %f8               ! load one 64-byte input block into %f8-%f23
+       ldd     [%o1 + 0x08], %f10
+       ldd     [%o1 + 0x10], %f12
+       ldd     [%o1 + 0x18], %f14
+       ldd     [%o1 + 0x20], %f16
+       ldd     [%o1 + 0x28], %f18
+       ldd     [%o1 + 0x30], %f20
+       ldd     [%o1 + 0x38], %f22
+
+       .word   0x81b02840      ! SHA256 opcode emitted as a raw word; presumably consumes %f8-%f23 and updates %f0-%f7 (TODO confirm)
+
+       subcc   %o2, 1, %o2                     ! one block done; NOTE(review): count is tested only after a block, so a zero count on entry would wrap
+       bne,pt  %icc, .Lhwloop                  ! NOTE(review): %icc tests only the low 32 bits of the count in %o2; confirm that is intended for 64-bit builds
+        add    %o1, 0x40, %o1                  ! delay slot: advance input pointer by 64 bytes
+
+.Lhwfinish:
+       st      %f0, [%o0 + 0x00]               ! write the eight state words back to [%o0]
+       st      %f1, [%o0 + 0x04]
+       st      %f2, [%o0 + 0x08]
+       st      %f3, [%o0 + 0x0c]
+       st      %f4, [%o0 + 0x10]
+       st      %f5, [%o0 + 0x14]
+       st      %f6, [%o0 + 0x18]
+       retl                                    ! leaf return, no register window was opened on this path
+        st     %f7, [%o0 + 0x1c]               ! delay slot: last state word
+.Lhwunaligned:
+       alignaddr %o1, %g0, %o1                 ! round %o1 down to 8 bytes; the byte offset is kept in GSR for faligndata
+
+       ldd     [%o1 + 0x00], %f10              ! prime the pipeline with the first aligned doubleword
+.Lhwunaligned_loop:
+       ldd     [%o1 + 0x08], %f12              ! load eight more doublewords (offsets 0x08-0x40), nine in flight in total
+       ldd     [%o1 + 0x10], %f14
+       ldd     [%o1 + 0x18], %f16
+       ldd     [%o1 + 0x20], %f18
+       ldd     [%o1 + 0x28], %f20
+       ldd     [%o1 + 0x30], %f22
+       ldd     [%o1 + 0x38], %f24
+       ldd     [%o1 + 0x40], %f26              ! doubleword that straddles the end of this block
+
+       faligndata %f10, %f12, %f8              ! realign: each adjacent pair yields one 8-byte chunk of the block, into %f8-%f23
+       faligndata %f12, %f14, %f10
+       faligndata %f14, %f16, %f12
+       faligndata %f16, %f18, %f14
+       faligndata %f18, %f20, %f16
+       faligndata %f20, %f22, %f18
+       faligndata %f22, %f24, %f20
+       faligndata %f24, %f26, %f22
+
+       .word   0x81b02840      ! SHA256 opcode, same raw encoding as in the aligned loop
+
+       subcc   %o2, 1, %o2                     ! decrement block count, set condition codes for the bne below
+       fsrc2   %f26, %f10                      ! carry the straddling doubleword over as the first one of the next block
+       bne,pt  %icc, .Lhwunaligned_loop        ! same low-32-bit count test as the aligned loop
+        add    %o1, 0x40, %o1                  ! delay slot: advance the aligned pointer by 64 bytes
+
+       ba,a,pt %xcc, .Lhwfinish                ! annulled branch (no delay slot executed) to the shared write-back and return
+___
+$code.=<<___;
+.Lsoftware:
        save    %sp,`-$frame-$locals`,%sp
        and     $inp,`$align-1`,$tmp31
        sllx    $len,`log(16*$SZ)/log(2)`,$len
-- 
1.7.10.4

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       [email protected]
Automated List Manager                           [email protected]

[PATCH 5/7] sparc: Add support for SHA{1,256,512} opcodes.

Reply via email to