Revision: 25051
          http://sourceforge.net/p/gar/code/25051
Author:   janholzh
Date:     2015-06-02 07:53:16 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: Oracle moved patches around

Modified Paths:
--------------
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch
    csw/mgar/pkg/openssl1/trunk/files/update-t4-patch.sh
    csw/mgar/pkg/openssl1/trunk/files/update-wanboot-patch.sh

Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:41:26 UTC (rev 25050)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:53:16 UTC (rev 25051)
@@ -2227,3 +2227,5563 @@
      {ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
      {ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
      {ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
+Index: crypto/sparc_arch.h
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
+--- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/sparc_arch.h 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,101 @@
++#ifndef __SPARC_ARCH_H__
++#define       __SPARC_ARCH_H__
++
++#define       SPARCV9_TICK_PRIVILEGED (1<<0)
++#define       SPARCV9_PREFER_FPU      (1<<1)
++#define       SPARCV9_VIS1            (1<<2)
++#define       SPARCV9_VIS2            (1<<3)  /* reserved */
++#define       SPARCV9_FMADD           (1<<4)  /* reserved for SPARC64 V */
++#define       SPARCV9_BLK             (1<<5)  /* VIS1 block copy */
++#define       SPARCV9_VIS3            (1<<6)
++#define       SPARCV9_RANDOM          (1<<7)
++#define       SPARCV9_64BIT_STACK     (1<<8)
++
++/*
++ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
++ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
++ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
++ */
++#define       CFR_AES         0x00000001 /* Supports AES opcodes      */
++#define       CFR_DES         0x00000002 /* Supports DES opcodes      */
++#define       CFR_KASUMI      0x00000004 /* Supports KASUMI opcodes   */
++#define       CFR_CAMELLIA    0x00000008 /* Supports CAMELLIA opcodes */
++#define       CFR_MD5         0x00000010 /* Supports MD5 opcodes      */
++#define       CFR_SHA1        0x00000020 /* Supports SHA1 opcodes     */
++#define       CFR_SHA256      0x00000040 /* Supports SHA256 opcodes   */
++#define       CFR_SHA512      0x00000080 /* Supports SHA512 opcodes   */
++#define       CFR_MPMUL       0x00000100 /* Supports MPMUL opcodes    */
++#define       CFR_MONTMUL     0x00000200 /* Supports MONTMUL opcodes  */
++#define       CFR_MONTSQR     0x00000400 /* Supports MONTSQR opcodes  */
++#define       CFR_CRC32C      0x00000800 /* Supports CRC32C opcodes   */
++
++#if defined(OPENSSL_PIC) && !defined(__PIC__)
++#define       __PIC__
++#endif
++
++#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
++#define       __arch64__
++#endif
++
++#define       SPARC_PIC_THUNK(reg)    \
++      .align  32;             \
++.Lpic_thunk:                  \
++      jmp     %o7 + 8;        \
++      add     %o7, reg, reg;
++
++#define       SPARC_PIC_THUNK_CALL(reg)                       \
++      sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
++      call    .Lpic_thunk;                            \
++      or      reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
++
++#if 1
++#define       SPARC_SETUP_GOT_REG(reg)        SPARC_PIC_THUNK_CALL(reg)
++#else
++#define       SPARC_SETUP_GOT_REG(reg)        \
++      sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
++      call    .+8;                                    \
++      or      reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
++      add     %o7, reg, reg
++#endif
++
++#if defined(__arch64__)
++
++#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
++      setx    SYM, %o7, reg;
++#define       LDPTR           ldx
++#define       SIZE_T_CC       %xcc
++#define       STACK_FRAME     192
++#define       STACK_BIAS      2047
++#define       STACK_7thARG    (STACK_BIAS+176)
++
++#else
++
++#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
++      set     SYM, reg;
++#define       LDPTR           ld
++#define       SIZE_T_CC       %icc
++#define       STACK_FRAME     112
++#define       STACK_BIAS      0
++#define       STACK_7thARG    92
++#define      SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
++
++#endif
++
++#ifdef __PIC__
++#undef        SPARC_LOAD_ADDRESS
++#undef SPARC_LOAD_ADDRESS_LEAF
++#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
++      SPARC_SETUP_GOT_REG(reg);       \
++      sethi   %hi(SYM), %o7;          \
++      or      %o7, %lo(SYM), %o7;     \
++      LDPTR   [reg + %o7], reg;
++#endif
++
++#ifndef SPARC_LOAD_ADDRESS_LEAF
++#define       SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp)  \
++      mov     %o7, tmp;                       \
++      SPARC_LOAD_ADDRESS(SYM, reg)            \
++      mov     tmp, %o7;
++#endif
++
++#endif        /* __SPARC_ARCH_H__ */
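
For reference, the CFR bits introduced above map one hardware feature per bit of OPENSSL_sparcv9cap_P[1]; a minimal Perl sketch (the input word below is hypothetical, not a value from this patch) decodes such a word into feature names:

    # decode a hypothetical OPENSSL_sparcv9cap_P[1] (%cfr copy) into
    # the CFR_* feature names defined in sparc_arch.h above
    my %cfr = (
        AES      => 0x001, DES     => 0x002, KASUMI => 0x004,
        CAMELLIA => 0x008, MD5     => 0x010, SHA1   => 0x020,
        SHA256   => 0x040, SHA512  => 0x080, MPMUL  => 0x100,
        MONTMUL  => 0x200, MONTSQR => 0x400, CRC32C => 0x800,
    );
    my $word = 0x0ff3;    # hypothetical %cfr value, for illustration only
    print join(" ", grep { $word & $cfr{$_} } sort keys %cfr), "\n";
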
+Index: crypto/md5/asm/md5-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
+--- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,434 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++#
++# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
++# ====================================================================
++
++# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
++# code generated by Sun C 5.2.
++
++# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
++# faster than software. Multi-process benchmark saturates at 12x
++# single-process result on 8-core processor, or ~11GBps per 2.85GHz
++# socket.
++
++$bits=32;
++for (@ARGV)   { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64)        { $bias=2047; $frame=192; }
++else          { $bias=0;    $frame=112; }
++
++$output=shift;
++open STDOUT,">$output";
++
++use integer;
++
++($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
++
++# 64-bit values
++@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
++$tx="%g3";
++($AB,$CD)=("%g4","%g5");
++
++# 32-bit values
++@V=($A,$B,$C,$D)=map("%l$_",(0..3));
++($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
++($shr,$shl1,$shl2)=("%i3","%i4","%i5");
++
++my @K=(       0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
++      0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
++      0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
++      0x6b901122,0xfd987193,0xa679438e,0x49b40821,
++
++      0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
++      0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
++      0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
++      0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
++
++      0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
++      0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
++      0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
++      0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
++
++      0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
++      0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
++      0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
++      0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0  );
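
The @K table above holds MD5's standard T constants, T[i] = floor(abs(sin(i)) * 2^32) for i = 1..64; if the transcription ever needs checking, they can be regenerated in a couple of lines:

    # regenerate MD5's T[1..64] and compare against the @K table above
    use POSIX qw(floor);
    printf "0x%08x\n", floor(abs(sin($_ + 1)) * 2**32) for 0 .. 63;
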
++
++sub R0 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (7,12,17,22)[$i%4];
++  my $j   = ($i+1)/2;
++
++  if ($i&1) {
++    $code.=<<___;
++       srlx   @X[$j],$shr,@X[$j]      ! align X[`$i+1`]
++      and     $b,$t1,$t1              ! round $i
++       sllx   @X[$j+1],$shl1,$tx
++      add     $t2,$a,$a
++       sllx   $tx,$shl2,$tx
++      xor     $d,$t1,$t1
++       or     $tx,@X[$j],@X[$j]
++       sethi  %hi(@K[$i+1]),$t2
++      add     $t1,$a,$a
++       or     $t2,%lo(@K[$i+1]),$t2
++      sll     $a,$rot,$t3
++       add    @X[$j],$t2,$t2          ! X[`$i+1`]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++       xor     $b,$c,$t1
++      add     $t3,$a,$a
++___
++  } else {
++    $code.=<<___;
++       srlx   @X[$j],32,$tx           ! extract X[`2*$j+1`]
++      and     $b,$t1,$t1              ! round $i
++      add     $t2,$a,$a
++      xor     $d,$t1,$t1
++       sethi  %hi(@K[$i+1]),$t2
++      add     $t1,$a,$a
++       or     $t2,%lo(@K[$i+1]),$t2
++      sll     $a,$rot,$t3
++       add    $tx,$t2,$t2             ! X[`2*$j+1`]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++       xor     $b,$c,$t1
++      add     $t3,$a,$a
++___
++  }
++}
++
++sub R0_1 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (7,12,17,22)[$i%4];
++
++$code.=<<___;
++       srlx   @X[0],32,$tx            ! extract X[1]
++      and     $b,$t1,$t1              ! round $i
++      add     $t2,$a,$a
++      xor     $d,$t1,$t1
++       sethi  %hi(@K[$i+1]),$t2
++      add     $t1,$a,$a
++       or     $t2,%lo(@K[$i+1]),$t2
++      sll     $a,$rot,$t3
++       add    $tx,$t2,$t2             ! X[1]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++       andn    $b,$c,$t1
++      add     $t3,$a,$a
++___
++}
++
++sub R1 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (5,9,14,20)[$i%4];
++  my $j   = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++       srlx   @X[$j/2],32,$xi         ! extract X[$j]
++___
++$code.=<<___;
++      and     $b,$d,$t3               ! round $i
++      add     $t2,$a,$a
++      or      $t3,$t1,$t1
++       sethi  %hi(@K[$i+1]),$t2
++      add     $t1,$a,$a
++       or     $t2,%lo(@K[$i+1]),$t2
++      sll     $a,$rot,$t3
++       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++       `$i<31?"andn":"xor"`    $b,$c,$t1
++      add     $t3,$a,$a
++___
++}
++
++sub R2 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (4,11,16,23)[$i%4];
++  my $j   = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++       srlx   @X[$j/2],32,$xi         ! extract X[$j]
++___
++$code.=<<___;
++      add     $t2,$a,$a               ! round $i
++      xor     $b,$t1,$t1
++       sethi  %hi(@K[$i+1]),$t2
++      add     $t1,$a,$a
++       or     $t2,%lo(@K[$i+1]),$t2
++      sll     $a,$rot,$t3
++       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++       xor     $b,$c,$t1
++      add     $t3,$a,$a
++___
++}
++
++sub R3 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (6,10,15,21)[$i%4];
++  my $j   = (0+7*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___;
++      add     $t2,$a,$a               ! round $i
++___
++$code.=<<___ if ($j&1 && ($xi=$tx));
++       srlx   @X[$j/2],32,$xi         ! extract X[$j]
++___
++$code.=<<___;
++      orn     $b,$d,$t1
++       sethi  %hi(@K[$i+1]),$t2
++      xor     $c,$t1,$t1
++       or     $t2,%lo(@K[$i+1]),$t2
++      add     $t1,$a,$a
++      sll     $a,$rot,$t3
++       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
++      srl     $a,32-$rot,$a
++      add     $b,$t3,$t3
++      add     $t3,$a,$a
++___
++}
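
The index expressions in R1, R2 and R3 above fetch the message word for round $i+1 one round ahead; they follow MD5's per-round message schedule, which this standalone sketch reproduces:

    # MD5 message schedule: which X[j] round i consumes (i = 0..63)
    for my $i (0..63) {
        my $j = $i < 16 ? $i
              : $i < 32 ? (5*$i + 1) % 16
              : $i < 48 ? (3*$i + 5) % 16
              :           (7*$i)     % 16;
        print "round $i: X[$j]\n";
    }
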
++
++$code.=<<___ if ($bits==64);
++.register     %g2,#scratch
++.register     %g3,#scratch
++___
++$code.=<<___;
++#include "sparc_arch.h"
++
++.section      ".text",#alloc,#execinstr
++
++#ifdef __PIC__
++SPARC_PIC_THUNK(%g1)
++#endif
++
++.globl        md5_block_asm_data_order
++.align        32
++md5_block_asm_data_order:
++      SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
++      ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
++
++      andcc   %g1, CFR_MD5, %g0
++      be      .Lsoftware
++      nop
++
++      mov     4, %g1
++      andcc   %o1, 0x7, %g0
++      lda     [%o0 + %g0]0x88, %f0            ! load context
++      lda     [%o0 + %g1]0x88, %f1
++      add     %o0, 8, %o0
++      lda     [%o0 + %g0]0x88, %f2
++      lda     [%o0 + %g1]0x88, %f3
++      bne,pn  %icc, .Lhwunaligned
++      sub     %o0, 8, %o0
++
++.Lhw_loop:
++      ldd     [%o1 + 0x00], %f8
++      ldd     [%o1 + 0x08], %f10
++      ldd     [%o1 + 0x10], %f12
++      ldd     [%o1 + 0x18], %f14
++      ldd     [%o1 + 0x20], %f16
++      ldd     [%o1 + 0x28], %f18
++      ldd     [%o1 + 0x30], %f20
++      subcc   %o2, 1, %o2             ! done yet? 
++      ldd     [%o1 + 0x38], %f22
++      add     %o1, 0x40, %o1
++      prefetch [%o1 + 63], 20
++
++      .word   0x81b02800              ! MD5
++
++      bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhw_loop
++      nop
++
++.Lhwfinish:
++      sta     %f0, [%o0 + %g0]0x88    ! store context
++      sta     %f1, [%o0 + %g1]0x88
++      add     %o0, 8, %o0
++      sta     %f2, [%o0 + %g0]0x88
++      sta     %f3, [%o0 + %g1]0x88
++      retl
++      nop
++
++.align        8
++.Lhwunaligned:
++      alignaddr %o1, %g0, %o1
++
++      ldd     [%o1 + 0x00], %f10
++.Lhwunaligned_loop:
++      ldd     [%o1 + 0x08], %f12
++      ldd     [%o1 + 0x10], %f14
++      ldd     [%o1 + 0x18], %f16
++      ldd     [%o1 + 0x20], %f18
++      ldd     [%o1 + 0x28], %f20
++      ldd     [%o1 + 0x30], %f22
++      ldd     [%o1 + 0x38], %f24
++      subcc   %o2, 1, %o2             ! done yet?
++      ldd     [%o1 + 0x40], %f26
++      add     %o1, 0x40, %o1
++      prefetch [%o1 + 63], 20
++
++      faligndata %f10, %f12, %f8
++      faligndata %f12, %f14, %f10
++      faligndata %f14, %f16, %f12
++      faligndata %f16, %f18, %f14
++      faligndata %f18, %f20, %f16
++      faligndata %f20, %f22, %f18
++      faligndata %f22, %f24, %f20
++      faligndata %f24, %f26, %f22
++
++      .word   0x81b02800              ! MD5
++
++      bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++      for     %f26, %f26, %f10        ! %f10=%f26
++
++      ba      .Lhwfinish
++      nop
++
++.align        16
++.Lsoftware:
++      save    %sp,-$frame,%sp
++
++      rd      %asi,$saved_asi
++      wr      %g0,0x88,%asi           ! ASI_PRIMARY_LITTLE
++      and     $inp,7,$shr
++      andn    $inp,7,$inp
++
++      sll     $shr,3,$shr             ! *=8
++      mov     56,$shl2
++      ld      [$ctx+0],$A
++      sub     $shl2,$shr,$shl2
++      ld      [$ctx+4],$B
++      and     $shl2,32,$shl1
++      add     $shl2,8,$shl2
++      ld      [$ctx+8],$C
++      sub     $shl2,$shl1,$shl2       ! shr+shl1+shl2==64
++      ld      [$ctx+12],$D
++      nop
++
++.Loop:
++       cmp    $shr,0                  ! was inp aligned?
++      ldxa    [$inp+0]%asi,@X[0]      ! load little-endian input
++      ldxa    [$inp+8]%asi,@X[1]
++      ldxa    [$inp+16]%asi,@X[2]
++      ldxa    [$inp+24]%asi,@X[3]
++      ldxa    [$inp+32]%asi,@X[4]
++       sllx   $A,32,$AB               ! pack A,B
++      ldxa    [$inp+40]%asi,@X[5]
++       sllx   $C,32,$CD               ! pack C,D
++      ldxa    [$inp+48]%asi,@X[6]
++       or     $B,$AB,$AB
++      ldxa    [$inp+56]%asi,@X[7]
++       or     $D,$CD,$CD
++      bnz,a,pn        %icc,.+8
++      ldxa    [$inp+64]%asi,@X[8]
++
++      srlx    @X[0],$shr,@X[0]        ! align X[0]
++      sllx    @X[1],$shl1,$tx
++       sethi  %hi(@K[0]),$t2
++      sllx    $tx,$shl2,$tx
++       or     $t2,%lo(@K[0]),$t2
++      or      $tx,@X[0],@X[0]
++       xor    $C,$D,$t1
++       add    @X[0],$t2,$t2           ! X[0]+K[0]
++___
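
The prologue above derives three shift counts from the input misalignment so that pairs of aligned 64-bit loads can be spliced into one little-endian word; the commented invariant shr+shl1+shl2==64 can be confirmed for every possible misalignment:

    # mirror the sll/sub/and/add chain from the .Lsoftware prologue above
    for my $mis (0..7) {
        my $shr  = $mis * 8;              # sll $shr,3,$shr  (*=8)
        my $shl1 = (56 - $shr) & 32;
        my $shl2 = 64 - $shr - $shl1;
        die "broken at misalignment $mis" unless $shr + $shl1 + $shl2 == 64;
    }
    print "shr+shl1+shl2 == 64 for all misalignments\n";
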
++      for ($i=0;$i<15;$i++)   { &R0($i,@V);   unshift(@V,pop(@V)); }
++      for (;$i<16;$i++)       { &R0_1($i,@V); unshift(@V,pop(@V)); }
++      for (;$i<32;$i++)       { &R1($i,@V);   unshift(@V,pop(@V)); }
++      for (;$i<48;$i++)       { &R2($i,@V);   unshift(@V,pop(@V)); }
++      for (;$i<64;$i++)       { &R3($i,@V);   unshift(@V,pop(@V)); }
++$code.=<<___;
++      srlx    $AB,32,$t1              ! unpack A,B,C,D and accumulate
++      add     $inp,64,$inp            ! advance inp
++      srlx    $CD,32,$t2
++      add     $t1,$A,$A
++      subcc   $len,1,$len             ! done yet?
++      add     $AB,$B,$B
++      add     $t2,$C,$C
++      add     $CD,$D,$D
++      srl     $B,0,$B                 ! clruw $B
++      bne     `$bits==64?"%xcc":"%icc"`,.Loop
++      srl     $D,0,$D                 ! clruw $D
++
++      st      $A,[$ctx+0]             ! write out ctx
++      st      $B,[$ctx+4]
++      st      $C,[$ctx+8]
++      st      $D,[$ctx+12]
++
++      wr      %g0,$saved_asi,%asi
++      ret
++      restore
++.type md5_block_asm_data_order,#function
++.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
++
++.asciz       "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
++.align        4
++___
++
++# Purpose of these subroutines is to explicitly encode VIS instructions,
++# so that one can compile the module without having to specify VIS
++# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# The idea is to reserve the option to produce a "universal" binary and
++# let the programmer detect if the current CPU is VIS capable at run-time.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my $ref,$opf;
++my %visopf = (        "faligndata"    => 0x048,
++              "for"           => 0x07c        );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++sub unalignaddr {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my $ref="$mnemonic\t$rs1,$rs2,$rd";
++
++    foreach ($rs1,$rs2,$rd) {
++      if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
++      else                    { return $ref; }
++    }
++    return  sprintf ".word\t0x%08x !%s",
++                  0x81b00300|$rd<<25|$rs1<<14|$rs2,
++                  $ref;
++}
++
++foreach (split("\n",$code)) {
++      s/\`([^\`]*)\`/eval $1/ge;
++
++      s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++              &unvis($1,$2,$3,$4)
++       /ge;
++      s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++              &unalignaddr($1,$2,$3,$4)
++       /ge;
++
++      print $_,"\n";
++}
++
++close STDOUT;
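
The unvis() and unalignaddr() helpers above hand-assemble VIS opcodes into .word directives so the module builds without VIS assembler support; the encoding of a single concrete instruction can be reproduced standalone:

    # hand-encode "faligndata %f0,%f2,%f4" exactly as unvis() would
    my ($rd, $rs1, $opf, $rs2) = (4, 0, 0x048, 2);
    printf ".word\t0x%08x\n", 0x81b00000 | $rd<<25 | $rs1<<14 | $opf<<5 | $rs2;
    # prints: .word 0x89b00902
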
+Index: crypto/aes/asm/aest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,902 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
++# <appro@openssl.org>. The module is licensed under 2-clause BSD
++# license. October 2012. All rights reserved.
++# ====================================================================
++
++######################################################################
++# AES for SPARC T4.
++#
++# AES round instructions complete in 3 cycles and can be issued every
++# cycle. It means that round calculations should take 4*rounds cycles,
++# because any given round instruction depends on result of *both*
++# previous instructions:
++#
++#     |0 |1 |2 |3 |4
++#     |01|01|01|
++#        |23|23|23|
++#                 |01|01|...
++#                    |23|...
++#
++# Provided that fxor [with IV] takes 3 cycles to complete, critical
++# path length for CBC encrypt would be 3+4*rounds, or in other words
++# it should process one byte in at least (3+4*rounds)/16 cycles. This
++# estimate doesn't account for "collateral" instructions, such as
++# fetching input from memory, xor-ing it with zero-round key and
++# storing the result. Yet, *measured* performance [for data aligned
++# at 64-bit boundary!] deviates from this equation by less than 0.5%:
++#
++#             128-bit key     192-            256-
++# CBC encrypt 2.70/2.90(*)    3.20/3.40       3.70/3.90
++#                      (*) numbers after slash are for
++#                          misaligned data.
++#
++# Out-of-order execution logic managed to fully overlap "collateral"
++# instructions with those on critical path. Amazing!
++#
++# As with Intel AES-NI, question is if it's possible to improve
++# performance of parallelizable modes by interleaving round
++# instructions. Provided round instruction latency and throughput
++# optimal interleave factor is 2. But can we expect 2x performance
++# improvement? Well, as round instructions can be issued one per
++# cycle, they don't saturate the 2-way issue pipeline and therefore
++# there is room for "collateral" calculations... Yet, 2x speed-up
++# over CBC encrypt remains unattainable:
++#
++#             128-bit key     192-            256-
++# CBC decrypt 1.64/2.11       1.89/2.37       2.23/2.61
++# CTR         1.64/2.08(*)    1.89/2.33       2.23/2.61
++#                      (*) numbers after slash are for
++#                          misaligned data.
++#
++# Estimates based on amount of instructions under assumption that
++# round instructions are not pairable with any other instruction
++# suggest that latter is the actual case and pipeline runs
++# underutilized. It should be noted that T4 out-of-order execution
++# logic is so capable that performance gain from 2x interleave is
++# not even impressive, ~7-13% over non-interleaved code, largest
++# for 256-bit keys.
++
++# To anchor to something else, software implementation processes
++# one byte in 29 cycles with 128-bit key on same processor. Intel
++# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
++# in 0.93, naturally with AES-NI.
++
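
The (3+4*rounds)/16 estimate above is easy to evaluate for the three key sizes; it lands within the quoted 0.5% of the measured aligned-data figures (2.70/3.20/3.70 cycles per byte):

    # critical-path estimate: 3 cycles for fxor plus 4 cycles per round,
    # amortized over a 16-byte block
    for my $rounds (10, 12, 14) {         # AES-128/192/256
        printf "%2d rounds: %.2f cycles/byte\n", $rounds, (3 + 4*$rounds)/16;
    }
    # 10 rounds: 2.69, 12 rounds: 3.19, 14 rounds: 3.69
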
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$::evp=1;     # if $evp is set to 0, script generates module with
++# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
++# points. These however are not fully compatible with openssl/aes.h,
++# because they expect AES_KEY to be aligned at 64-bit boundary. When
++# used through EVP, alignment is arranged at EVP layer. Second thing
++# that is arranged by EVP is at least 32-bit alignment of IV.
++
++######################################################################
++# single-round subroutines
++#
++{
++my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
++
++$code=<<___;
++.text
++
++.globl        aes_t4_encrypt
++.align        32
++aes_t4_encrypt:
++      andcc           $inp, 7, %g1            ! is input aligned?
++      andn            $inp, 7, $inp
++
++      ldx             [$key + 0], %g4
++      ldx             [$key + 8], %g5
++
++      ldx             [$inp + 0], %o4
++      bz,pt           %icc, 1f
++      ldx             [$inp + 8], %o5
++      ldx             [$inp + 16], $inp
++      sll             %g1, 3, %g1
++      sub             %g0, %g1, %o3
++      sllx            %o4, %g1, %o4
++      sllx            %o5, %g1, %g1
++      srlx            %o5, %o3, %o5
++      srlx            $inp, %o3, %o3
++      or              %o5, %o4, %o4
++      or              %o3, %g1, %o5
++1:
++      ld              [$key + 240], $rounds
++      ldd             [$key + 16], %f12
++      ldd             [$key + 24], %f14
++      xor             %g4, %o4, %o4
++      xor             %g5, %o5, %o5
++      movxtod         %o4, %f0
++      movxtod         %o5, %f2
++      srl             $rounds, 1, $rounds
++      ldd             [$key + 32], %f16
++      sub             $rounds, 1, $rounds
++      ldd             [$key + 40], %f18
++      add             $key, 48, $key
++
++.Lenc:
++      aes_eround01    %f12, %f0, %f2, %f4
++      aes_eround23    %f14, %f0, %f2, %f2
++      ldd             [$key + 0], %f12
++      ldd             [$key + 8], %f14
++      sub             $rounds,1,$rounds
++      aes_eround01    %f16, %f4, %f2, %f0
++      aes_eround23    %f18, %f4, %f2, %f2
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      brnz,pt         $rounds, .Lenc
++      add             $key, 32, $key
++
++      andcc           $out, 7, $tmp           ! is output aligned?
++      aes_eround01    %f12, %f0, %f2, %f4
++      aes_eround23    %f14, %f0, %f2, %f2
++      aes_eround01_l  %f16, %f4, %f2, %f0
++      aes_eround23_l  %f18, %f4, %f2, %f2
++
++      bnz,pn          %icc, 2f
++      nop
++
++      std             %f0, [$out + 0]
++      retl
++      std             %f2, [$out + 8]
++
++2:    alignaddrl      $out, %g0, $out
++      mov             0xff, $mask
++      srl             $mask, $tmp, $mask
++
++      faligndata      %f0, %f0, %f4
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++
++      stda            %f4, [$out + $mask]0xc0 ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $mask, $mask
++      retl
++      stda            %f8, [$out + $mask]0xc0 ! partial store
++.type aes_t4_encrypt,#function
++.size aes_t4_encrypt,.-aes_t4_encrypt
++
++.globl        aes_t4_decrypt
++.align        32
++aes_t4_decrypt:
++      andcc           $inp, 7, %g1            ! is input aligned?
++      andn            $inp, 7, $inp
++
++      ldx             [$key + 0], %g4
++      ldx             [$key + 8], %g5
++
++      ldx             [$inp + 0], %o4
++      bz,pt           %icc, 1f
++      ldx             [$inp + 8], %o5
++      ldx             [$inp + 16], $inp
++      sll             %g1, 3, %g1
++      sub             %g0, %g1, %o3
++      sllx            %o4, %g1, %o4
++      sllx            %o5, %g1, %g1
++      srlx            %o5, %o3, %o5
++      srlx            $inp, %o3, %o3
++      or              %o5, %o4, %o4
++      or              %o3, %g1, %o5
++1:
++      ld              [$key + 240], $rounds
++      ldd             [$key + 16], %f12
++      ldd             [$key + 24], %f14
++      xor             %g4, %o4, %o4
++      xor             %g5, %o5, %o5
++      movxtod         %o4, %f0
++      movxtod         %o5, %f2
++      srl             $rounds, 1, $rounds
++      ldd             [$key + 32], %f16
++      sub             $rounds, 1, $rounds
++      ldd             [$key + 40], %f18
++      add             $key, 48, $key
++
++.Ldec:
++      aes_dround01    %f12, %f0, %f2, %f4
++      aes_dround23    %f14, %f0, %f2, %f2
++      ldd             [$key + 0], %f12
++      ldd             [$key + 8], %f14
++      sub             $rounds,1,$rounds
++      aes_dround01    %f16, %f4, %f2, %f0
++      aes_dround23    %f18, %f4, %f2, %f2
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      brnz,pt         $rounds, .Ldec
++      add             $key, 32, $key
++
++      andcc           $out, 7, $tmp           ! is output aligned?
++      aes_dround01    %f12, %f0, %f2, %f4
++      aes_dround23    %f14, %f0, %f2, %f2
++      aes_dround01_l  %f16, %f4, %f2, %f0
++      aes_dround23_l  %f18, %f4, %f2, %f2
++
++      bnz,pn          %icc, 2f
++      nop
++
++      std             %f0, [$out + 0]
++      retl
++      std             %f2, [$out + 8]
++
++2:    alignaddrl      $out, %g0, $out
++      mov             0xff, $mask
++      srl             $mask, $tmp, $mask
++
++      faligndata      %f0, %f0, %f4
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++
++      stda            %f4, [$out + $mask]0xc0 ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $mask, $mask
++      retl
++      stda            %f8, [$out + $mask]0xc0 ! partial store
++.type aes_t4_decrypt,#function
++.size aes_t4_decrypt,.-aes_t4_decrypt
++___
++}
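
For misaligned output the code above realigns the pointer with alignaddrl and finishes with mask-guarded partial stores (stda with ASI 0xc0); a sketch of the mask arithmetic, assuming the usual byte-lane reading of the mask, looks like this:

    # 0xff >> $tmp selects the byte lanes written by the first partial
    # store; orn complements it so the second store covers the remaining
    # lanes of the next doubleword
    for my $tmp (1..7) {
        my $first  = 0xff >> $tmp;
        my $second = ~$first & 0xff;      # orn %g0, $mask, $mask
        printf "misalign %d: first %08b, second %08b\n", $tmp, $first, $second;
    }
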
++
++######################################################################
++# key setup subroutines
++#
++{
++my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
++$code.=<<___;
++.globl        aes_t4_set_encrypt_key
++.align        32
++aes_t4_set_encrypt_key:
++.Lset_encrypt_key:
++      and             $inp, 7, $tmp
++      alignaddr       $inp, %g0, $inp
++      cmp             $bits, 192
++      ldd             [$inp + 0], %f0
++      bl,pt           %icc,.L128
++      ldd             [$inp + 8], %f2
++
++      be,pt           %icc,.L192
++      ldd             [$inp + 16], %f4
++      brz,pt          $tmp, .L256aligned
++      ldd             [$inp + 24], %f6
++
++      ldd             [$inp + 32], %f8
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++      faligndata      %f4, %f6, %f4
++      faligndata      %f6, %f8, %f6
++.L256aligned:
++___
++for ($i=0; $i<6; $i++) {
++    $code.=<<___;
++      std             %f0, [$out + `32*$i+0`]
++      aes_kexpand1    %f0, %f6, $i, %f0
++      std             %f2, [$out + `32*$i+8`]
++      aes_kexpand2    %f2, %f0, %f2
++      std             %f4, [$out + `32*$i+16`]
++      aes_kexpand0    %f4, %f2, %f4
++      std             %f6, [$out + `32*$i+24`]
++      aes_kexpand2    %f6, %f4, %f6
++___
++}
++$code.=<<___;
++      std             %f0, [$out + `32*$i+0`]
++      aes_kexpand1    %f0, %f6, $i, %f0
++      std             %f2, [$out + `32*$i+8`]
++      aes_kexpand2    %f2, %f0, %f2
++      std             %f4, [$out + `32*$i+16`]
++      std             %f6, [$out + `32*$i+24`]
++      std             %f0, [$out + `32*$i+32`]
++      std             %f2, [$out + `32*$i+40`]
++
++      mov             14, $tmp
++      st              $tmp, [$out + 240]
++      retl
++      xor             %o0, %o0, %o0
++
++.align        16
++.L192:
++      brz,pt          $tmp, .L192aligned
++      nop
++
++      ldd             [$inp + 24], %f6
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++      faligndata      %f4, %f6, %f4
++.L192aligned:
++___
++for ($i=0; $i<7; $i++) {
++    $code.=<<___;
++      std             %f0, [$out + `24*$i+0`]
++      aes_kexpand1    %f0, %f4, $i, %f0
++      std             %f2, [$out + `24*$i+8`]
++      aes_kexpand2    %f2, %f0, %f2
++      std             %f4, [$out + `24*$i+16`]
++      aes_kexpand2    %f4, %f2, %f4
++___
++}
++$code.=<<___;
++      std             %f0, [$out + `24*$i+0`]
++      aes_kexpand1    %f0, %f4, $i, %f0
++      std             %f2, [$out + `24*$i+8`]
++      aes_kexpand2    %f2, %f0, %f2
++      std             %f4, [$out + `24*$i+16`]
++      std             %f0, [$out + `24*$i+24`]
++      std             %f2, [$out + `24*$i+32`]
++
++      mov             12, $tmp
++      st              $tmp, [$out + 240]
++      retl
++      xor             %o0, %o0, %o0
++
++.align        16
++.L128:
++      brz,pt          $tmp, .L128aligned
++      nop
++
++      ldd             [$inp + 16], %f4
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++.L128aligned:
++___
++for ($i=0; $i<10; $i++) {
++    $code.=<<___;
++      std             %f0, [$out + `16*$i+0`]
++      aes_kexpand1    %f0, %f2, $i, %f0
++      std             %f2, [$out + `16*$i+8`]
++      aes_kexpand2    %f2, %f0, %f2
++___
++}
++$code.=<<___;
++      std             %f0, [$out + `16*$i+0`]
++      std             %f2, [$out + `16*$i+8`]
++
++      mov             10, $tmp
++      st              $tmp, [$out + 240]
++      retl
++      xor             %o0, %o0, %o0
++.type aes_t4_set_encrypt_key,#function
++.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
++
++.globl        aes_t4_set_decrypt_key
++.align        32
++aes_t4_set_decrypt_key:
++      mov             %o7, %o5
++      call            .Lset_encrypt_key
++      nop
++
++      mov             %o5, %o7
++      sll             $tmp, 4, $inp           ! $tmp is number of rounds
++      add             $tmp, 2, $tmp
++      add             $out, $inp, $inp        ! $inp=$out+16*rounds
++      srl             $tmp, 2, $tmp           ! $tmp=(rounds+2)/4
++
++.Lkey_flip:
++      ldd             [$out + 0],  %f0
++      ldd             [$out + 8],  %f2
++      ldd             [$out + 16], %f4
++      ldd             [$out + 24], %f6
++      ldd             [$inp + 0],  %f8
++      ldd             [$inp + 8],  %f10
++      ldd             [$inp - 16], %f12
++      ldd             [$inp - 8],  %f14
++      sub             $tmp, 1, $tmp
++      std             %f0, [$inp + 0]
++      std             %f2, [$inp + 8]
++      std             %f4, [$inp - 16]
++      std             %f6, [$inp - 8]
++      std             %f8, [$out + 0]
++      std             %f10, [$out + 8]
++      std             %f12, [$out + 16]
++      std             %f14, [$out + 24]
++      add             $out, 32, $out
++      brnz            $tmp, .Lkey_flip
++      sub             $inp, 32, $inp
++
++      retl
++      xor             %o0, %o0, %o0
++.type aes_t4_set_decrypt_key,#function
++.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
++___
++}
++
++{{{
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
++my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
++
++$code.=<<___;
++.align        32
++_aes128_loadkey:
++      ldx             [$key + 0], %g4
++      ldx             [$key + 8], %g5
++___
++for ($i=2; $i<22;$i++) {                      # load key schedule
++    $code.=<<___;
++      ldd             [$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++      retl
++      nop
++.type _aes128_loadkey,#function
++.size _aes128_loadkey,.-_aes128_loadkey
++_aes128_load_enckey=_aes128_loadkey
++_aes128_load_deckey=_aes128_loadkey
++
++.align        32
++_aes128_encrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_eround01    %f48, %f0, %f2, %f4
++      aes_eround23    %f50, %f0, %f2, %f2
++      aes_eround01_l  %f52, %f4, %f2, %f0
++      retl
++      aes_eround23_l  %f54, %f4, %f2, %f2
++.type _aes128_encrypt_1x,#function
++.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
++
++.align        32
++_aes128_encrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_eround01    %f48, %f0, %f2, %f8
++      aes_eround23    %f50, %f0, %f2, %f2
++      aes_eround01    %f48, %f4, %f6, %f10
++      aes_eround23    %f50, %f4, %f6, %f6
++      aes_eround01_l  %f52, %f8, %f2, %f0
++      aes_eround23_l  %f54, %f8, %f2, %f2
++      aes_eround01_l  %f52, %f10, %f6, %f4
++      retl
++      aes_eround23_l  %f54, %f10, %f6, %f6
++.type _aes128_encrypt_2x,#function
++.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
++
++.align        32
++_aes128_decrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_dround01    %f48, %f0, %f2, %f4
++      aes_dround23    %f50, %f0, %f2, %f2
++      aes_dround01_l  %f52, %f4, %f2, %f0
++      retl
++      aes_dround23_l  %f54, %f4, %f2, %f2
++.type _aes128_decrypt_1x,#function
++.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
++
++.align        32
++_aes128_decrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_dround01    %f48, %f0, %f2, %f8
++      aes_dround23    %f50, %f0, %f2, %f2
++      aes_dround01    %f48, %f4, %f6, %f10
++      aes_dround23    %f50, %f4, %f6, %f6
++      aes_dround01_l  %f52, %f8, %f2, %f0
++      aes_dround23_l  %f54, %f8, %f2, %f2
++      aes_dround01_l  %f52, %f10, %f6, %f4
++      retl
++      aes_dround23_l  %f54, %f10, %f6, %f6
++.type _aes128_decrypt_2x,#function
++.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
++
++.align        32
++_aes192_loadkey:
++_aes256_loadkey:
++      ldx             [$key + 0], %g4
++      ldx             [$key + 8], %g5
++___
++for ($i=2; $i<26;$i++) {                      # load key schedule
++    $code.=<<___;
++      ldd             [$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++      retl
++      nop
++.type _aes192_loadkey,#function
++.size _aes192_loadkey,.-_aes192_loadkey
++_aes192_load_enckey=_aes192_loadkey
++_aes192_load_deckey=_aes192_loadkey
++_aes256_load_enckey=_aes192_loadkey
++_aes256_load_deckey=_aes192_loadkey
++
++.align        32
++_aes192_encrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_eround01    %f56, %f0, %f2, %f4
++      aes_eround23    %f58, %f0, %f2, %f2
++      aes_eround01_l  %f60, %f4, %f2, %f0
++      retl
++      aes_eround23_l  %f62, %f4, %f2, %f2
++.type _aes192_encrypt_1x,#function
++.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
++
++.align        32
++_aes192_encrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_eround01    %f56, %f0, %f2, %f8
++      aes_eround23    %f58, %f0, %f2, %f2
++      aes_eround01    %f56, %f4, %f6, %f10
++      aes_eround23    %f58, %f4, %f6, %f6
++      aes_eround01_l  %f60, %f8, %f2, %f0
++      aes_eround23_l  %f62, %f8, %f2, %f2
++      aes_eround01_l  %f60, %f10, %f6, %f4
++      retl
++      aes_eround23_l  %f62, %f10, %f6, %f6
++.type _aes192_encrypt_2x,#function
++.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
++
++.align        32
++_aes192_decrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_dround01    %f56, %f0, %f2, %f4
++      aes_dround23    %f58, %f0, %f2, %f2
++      aes_dround01_l  %f60, %f4, %f2, %f0
++      retl
++      aes_dround23_l  %f62, %f4, %f2, %f2
++.type _aes192_decrypt_1x,#function
++.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
++
++.align        32
++_aes192_decrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_dround01    %f56, %f0, %f2, %f8
++      aes_dround23    %f58, %f0, %f2, %f2
++      aes_dround01    %f56, %f4, %f6, %f10
++      aes_dround23    %f58, %f4, %f6, %f6
++      aes_dround01_l  %f60, %f8, %f2, %f0
++      aes_dround23_l  %f62, %f8, %f2, %f2
++      aes_dround01_l  %f60, %f10, %f6, %f4
++      retl
++      aes_dround23_l  %f62, %f10, %f6, %f6
++.type _aes192_decrypt_2x,#function
++.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
++
++.align        32
++_aes256_encrypt_1x:
++      aes_eround01    %f16, %f0, %f2, %f4
++      aes_eround23    %f18, %f0, %f2, %f2
++      ldd             [$key + 208], %f16
++      ldd             [$key + 216], %f18
++      aes_eround01    %f20, %f4, %f2, %f0
++      aes_eround23    %f22, %f4, %f2, %f2
++      ldd             [$key + 224], %f20
++      ldd             [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_eround01    %f16, %f0, %f2, %f4
++      aes_eround23    %f18, %f0, %f2, %f2
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      aes_eround01_l  %f20, %f4, %f2, %f0
++      aes_eround23_l  %f22, %f4, %f2, %f2
++      ldd             [$key + 32], %f20
++      retl
++      ldd             [$key + 40], %f22
++.type _aes256_encrypt_1x,#function
++.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
++
++.align        32
++_aes256_encrypt_2x:
++      aes_eround01    %f16, %f0, %f2, %f8
++      aes_eround23    %f18, %f0, %f2, %f2
++      aes_eround01    %f16, %f4, %f6, %f10
++      aes_eround23    %f18, %f4, %f6, %f6
++      ldd             [$key + 208], %f16
++      ldd             [$key + 216], %f18
++      aes_eround01    %f20, %f8, %f2, %f0
++      aes_eround23    %f22, %f8, %f2, %f2
++      aes_eround01    %f20, %f10, %f6, %f4
++      aes_eround23    %f22, %f10, %f6, %f6
++      ldd             [$key + 224], %f20
++      ldd             [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_eround01    %f16, %f0, %f2, %f8
++      aes_eround23    %f18, %f0, %f2, %f2
++      aes_eround01    %f16, %f4, %f6, %f10
++      aes_eround23    %f18, %f4, %f6, %f6
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      aes_eround01_l  %f20, %f8, %f2, %f0
++      aes_eround23_l  %f22, %f8, %f2, %f2
++      aes_eround01_l  %f20, %f10, %f6, %f4
++      aes_eround23_l  %f22, %f10, %f6, %f6
++      ldd             [$key + 32], %f20
++      retl
++      ldd             [$key + 40], %f22
++.type _aes256_encrypt_2x,#function
++.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
++
++.align        32
++_aes256_decrypt_1x:
++      aes_dround01    %f16, %f0, %f2, %f4
++      aes_dround23    %f18, %f0, %f2, %f2
++      ldd             [$key + 208], %f16
++      ldd             [$key + 216], %f18
++      aes_dround01    %f20, %f4, %f2, %f0
++      aes_dround23    %f22, %f4, %f2, %f2
++      ldd             [$key + 224], %f20
++      ldd             [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++      aes_dround01    %f16, %f0, %f2, %f4
++      aes_dround23    %f18, %f0, %f2, %f2
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      aes_dround01_l  %f20, %f4, %f2, %f0
++      aes_dround23_l  %f22, %f4, %f2, %f2
++      ldd             [$key + 32], %f20
++      retl
++      ldd             [$key + 40], %f22
++.type _aes256_decrypt_1x,#function
++.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
++
++.align        32
++_aes256_decrypt_2x:
++      aes_dround01    %f16, %f0, %f2, %f8
++      aes_dround23    %f18, %f0, %f2, %f2
++      aes_dround01    %f16, %f4, %f6, %f10
++      aes_dround23    %f18, %f4, %f6, %f6
++      ldd             [$key + 208], %f16
++      ldd             [$key + 216], %f18
++      aes_dround01    %f20, %f8, %f2, %f0
++      aes_dround23    %f22, %f8, %f2, %f2
++      aes_dround01    %f20, %f10, %f6, %f4
++      aes_dround23    %f22, %f10, %f6, %f6
++      ldd             [$key + 224], %f20
++      ldd             [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
++      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
++      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
++      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
++      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
++      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
++      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
++      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++      aes_dround01    %f16, %f0, %f2, %f8
++      aes_dround23    %f18, %f0, %f2, %f2
++      aes_dround01    %f16, %f4, %f6, %f10
++      aes_dround23    %f18, %f4, %f6, %f6
++      ldd             [$key + 16], %f16
++      ldd             [$key + 24], %f18
++      aes_dround01_l  %f20, %f8, %f2, %f0
++      aes_dround23_l  %f22, %f8, %f2, %f2
++      aes_dround01_l  %f20, %f10, %f6, %f4
++      aes_dround23_l  %f22, %f10, %f6, %f6
++      ldd             [$key + 32], %f20
++      retl
++      ldd             [$key + 40], %f22
++.type _aes256_decrypt_2x,#function
++.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
++___
++
++&alg_cbc_encrypt_implement("aes",128);
++&alg_cbc_encrypt_implement("aes",192);
++&alg_cbc_encrypt_implement("aes",256);
++
++&alg_cbc_decrypt_implement("aes",128);
++&alg_cbc_decrypt_implement("aes",192);
++&alg_cbc_decrypt_implement("aes",256);
++
++if ($::evp) {
++    &alg_ctr32_implement("aes",128);
++    &alg_ctr32_implement("aes",192);
++    &alg_ctr32_implement("aes",256);
++}
++}}}
++
++if (!$::evp) {
++$code.=<<___;
++.global       AES_encrypt
++AES_encrypt=aes_t4_encrypt
++.global       AES_decrypt
++AES_decrypt=aes_t4_decrypt
++.global       AES_set_encrypt_key
++.align        32
++AES_set_encrypt_key:
++      andcc           %o2, 7, %g0             ! check alignment
++      bnz,a,pn        %icc, 1f
++      mov             -1, %o0
++      brz,a,pn        %o0, 1f
++      mov             -1, %o0
++      brz,a,pn        %o2, 1f
++      mov             -1, %o0
++      andncc          %o1, 0x1c0, %g0
++      bnz,a,pn        %icc, 1f
++      mov             -2, %o0
++      cmp             %o1, 128
++      bl,a,pn         %icc, 1f
++      mov             -2, %o0
++      b               aes_t4_set_encrypt_key
++      nop
++1:    retl
++      nop
++.type AES_set_encrypt_key,#function
++.size AES_set_encrypt_key,.-AES_set_encrypt_key
++
++.global       AES_set_decrypt_key
++.align        32
++AES_set_decrypt_key:
++      andcc           %o2, 7, %g0             ! check alignment
++      bnz,a,pn        %icc, 1f
++      mov             -1, %o0
++      brz,a,pn        %o0, 1f
++      mov             -1, %o0
++      brz,a,pn        %o2, 1f
++      mov             -1, %o0
++      andncc          %o1, 0x1c0, %g0
++      bnz,a,pn        %icc, 1f
++      mov             -2, %o0
++      cmp             %o1, 128
++      bl,a,pn         %icc, 1f
++      mov             -2, %o0
++      b               aes_t4_set_decrypt_key
++      nop
++1:    retl
++      nop
++.type AES_set_decrypt_key,#function
++.size AES_set_decrypt_key,.-AES_set_decrypt_key
++___
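
The andncc %o1, 0x1c0 test above rejects any bit length with bits set outside the 0x1c0 mask, and the following compare rejects anything below 128; replaying the filter over plausible key sizes shows only 128, 192 and 256 surviving:

    # replay the AES_set_encrypt_key bit-length filter (sketch)
    for my $bits (96, 128, 160, 192, 224, 256) {
        my $bad = ($bits & ~0x1c0) || ($bits < 128);
        printf "%3d bits: %s\n", $bits, $bad ? "rejected (-2)" : "accepted";
    }
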
++
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
++
++$code.=<<___;
++.globl        AES_cbc_encrypt
++.align        32
++AES_cbc_encrypt:
++      ld              [$key + 240], %g1
++      nop
++      brz             $enc, .Lcbc_decrypt
++      cmp             %g1, 12
++
++      bl,pt           %icc, aes128_t4_cbc_encrypt
++      nop
++      be,pn           %icc, aes192_t4_cbc_encrypt
++      nop
++      ba              aes256_t4_cbc_encrypt
++      nop
++
++.Lcbc_decrypt:
++      bl,pt           %icc, aes128_t4_cbc_decrypt
++      nop
++      be,pn           %icc, aes192_t4_cbc_decrypt
++      nop
++      ba              aes256_t4_cbc_decrypt
++      nop
++.type AES_cbc_encrypt,#function
++.size AES_cbc_encrypt,.-AES_cbc_encrypt
++___
++}
++$code.=<<___;
++.asciz        "AES for SPARC T4, David S. Miller, Andy Polyakov"
++.align        4
++___
++
++&emit_assembler();
++
++close STDOUT;
+Index: crypto/des/asm/dest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,602 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
++# <appro@openssl.org>. The module is licensed under 2-clause BSD
++# license. March 2013. All rights reserved.
++# ====================================================================
++
++######################################################################
++# DES for SPARC T4.
++#
++# As with other hardware-assisted ciphers CBC encrypt results [for
++# aligned data] are virtually identical to critical path lengths:
++#
++#             DES             Triple-DES
++# CBC encrypt 4.14/4.15(*)    11.7/11.7
++# CBC decrypt 1.77/4.11(**)   6.42/7.47
++#
++#                      (*)    numbers after slash are for
++#                             misaligned data;
++#                      (**)   this is result for largest
++#                             block size, unlike all other
++#                             cases smaller blocks results
++#                             are better[?];
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$code.=<<___ if ($::abibits==64);
++.register       %g2,#scratch
++.register       %g3,#scratch
++___
++
++$code.=<<___;
++.text
++___
++
++{ my ($inp,$out)=("%o0","%o1");
++
++$code.=<<___;
++.align        32
++.globl        des_t4_key_expand
++.type des_t4_key_expand,#function
++des_t4_key_expand:
++      andcc           $inp, 0x7, %g0
++      alignaddr       $inp, %g0, $inp
++      bz,pt           %icc, 1f
++      ldd             [$inp + 0x00], %f0
++      ldd             [$inp + 0x08], %f2
++      faligndata      %f0, %f2, %f0
++1:    des_kexpand     %f0, 0, %f0
++      des_kexpand     %f0, 1, %f2
++      std             %f0, [$out + 0x00]
++      des_kexpand     %f2, 3, %f6
++      std             %f2, [$out + 0x08]
++      des_kexpand     %f2, 2, %f4
++      des_kexpand     %f6, 3, %f10
++      std             %f6, [$out + 0x18]
++      des_kexpand     %f6, 2, %f8
++      std             %f4, [$out + 0x10]
++      des_kexpand     %f10, 3, %f14
++      std             %f10, [$out + 0x28]
++      des_kexpand     %f10, 2, %f12
++      std             %f8, [$out + 0x20]
++      des_kexpand     %f14, 1, %f16
++      std             %f14, [$out + 0x38]
++      des_kexpand     %f16, 3, %f20
++      std             %f12, [$out + 0x30]
++      des_kexpand     %f16, 2, %f18
++      std             %f16, [$out + 0x40]
++      des_kexpand     %f20, 3, %f24
++      std             %f20, [$out + 0x50]
++      des_kexpand     %f20, 2, %f22
++      std             %f18, [$out + 0x48]
++      des_kexpand     %f24, 3, %f28
++      std             %f24, [$out + 0x60]
++      des_kexpand     %f24, 2, %f26
++      std             %f22, [$out + 0x58]
++      des_kexpand     %f28, 1, %f30
++      std             %f28, [$out + 0x70]
++      std             %f26, [$out + 0x68]
++      retl
++      std             %f30, [$out + 0x78]
++.size des_t4_key_expand,.-des_t4_key_expand
++___
++}
++{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
++  my ($ileft,$iright,$omask) = map("%g$_",(1..3));
++
++$code.=<<___;
++.globl        des_t4_cbc_encrypt
++.align        32
++des_t4_cbc_encrypt:
++      ld              [$ivec + 0], %f0        ! load ivec
++      ld              [$ivec + 4], %f1
++
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             0xff, $omask
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      sub             %g0, $ileft, $iright
++      and             $out, 7, %g4
++      alignaddrl      $out, %g0, $out
++      srl             $omask, %g4, $omask
++      srlx            $len, 3, $len
++      movrz           %g4, 0, $omask
++      prefetch        [$out], 22
++
++      ldd             [$key + 0x00], %f4      ! load key schedule
++      ldd             [$key + 0x08], %f6
++      ldd             [$key + 0x10], %f8
++      ldd             [$key + 0x18], %f10
++      ldd             [$key + 0x20], %f12
++      ldd             [$key + 0x28], %f14
++      ldd             [$key + 0x30], %f16
++      ldd             [$key + 0x38], %f18
++      ldd             [$key + 0x40], %f20
++      ldd             [$key + 0x48], %f22
++      ldd             [$key + 0x50], %f24
++      ldd             [$key + 0x58], %f26
++      ldd             [$key + 0x60], %f28
++      ldd             [$key + 0x68], %f30
++      ldd             [$key + 0x70], %f32
++      ldd             [$key + 0x78], %f34
++
++.Ldes_cbc_enc_loop:
++      ldx             [$inp + 0], %g4
++      brz,pt          $ileft, 4f
++      nop
++
++      ldx             [$inp + 8], %g5
++      sllx            %g4, $ileft, %g4
++      srlx            %g5, $iright, %g5
++      or              %g5, %g4, %g4
++4:
++      movxtod         %g4, %f2
++      prefetch        [$inp + 8+63], 20
++      add             $inp, 8, $inp
++      fxor            %f2, %f0, %f0           ! ^= ivec
++      prefetch        [$out + 63], 22
++
++      des_ip          %f0, %f0
++      des_round       %f4, %f6, %f0, %f0
++      des_round       %f8, %f10, %f0, %f0
++      des_round       %f12, %f14, %f0, %f0
++      des_round       %f16, %f18, %f0, %f0
++      des_round       %f20, %f22, %f0, %f0
++      des_round       %f24, %f26, %f0, %f0
++      des_round       %f28, %f30, %f0, %f0
++      des_round       %f32, %f34, %f0, %f0
++      des_iip         %f0, %f0
++
++      brnz,pn         $omask, 2f
++      sub             $len, 1, $len
++
++      std             %f0, [$out + 0]
++      brnz,pt         $len, .Ldes_cbc_enc_loop
++      add             $out, 8, $out
++
++      st              %f0, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f1, [$ivec + 4]
++
++.align        16
++2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
++                                              ! and ~4x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f2           ! handle unaligned output
++
++      stda            %f2, [$out + $omask]0xc0        ! partial store
++      add             $out, 8, $out
++      orn             %g0, $omask, $omask
++      stda            %f2, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .Ldes_cbc_enc_loop+4
++      orn             %g0, $omask, $omask
++
++      st              %f0, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f1, [$ivec + 4]
++.type des_t4_cbc_encrypt,#function
++.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
++
++.globl        des_t4_cbc_decrypt
++.align        32
++des_t4_cbc_decrypt:
++      ld              [$ivec + 0], %f2        ! load ivec
++      ld              [$ivec + 4], %f3
++
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             0xff, $omask
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      sub             %g0, $ileft, $iright
++      and             $out, 7, %g4
++      alignaddrl      $out, %g0, $out
++      srl             $omask, %g4, $omask
++      srlx            $len, 3, $len
++      movrz           %g4, 0, $omask
++      prefetch        [$out], 22
++
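++      ! the schedule is loaded highest offset first, i.e. in reverse,
++      ! since decryption applies the round keys backwards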
++      ldd             [$key + 0x78], %f4      ! load key schedule
++      ldd             [$key + 0x70], %f6
++      ldd             [$key + 0x68], %f8
++      ldd             [$key + 0x60], %f10
++      ldd             [$key + 0x58], %f12
++      ldd             [$key + 0x50], %f14
++      ldd             [$key + 0x48], %f16
++      ldd             [$key + 0x40], %f18
++      ldd             [$key + 0x38], %f20
++      ldd             [$key + 0x30], %f22
++      ldd             [$key + 0x28], %f24
++      ldd             [$key + 0x20], %f26
++      ldd             [$key + 0x18], %f28
++      ldd             [$key + 0x10], %f30
++      ldd             [$key + 0x08], %f32
++      ldd             [$key + 0x00], %f34
++
++.Ldes_cbc_dec_loop:
++      ldx             [$inp + 0], %g4
++      brz,pt          $ileft, 4f
++      nop
++
++      ldx             [$inp + 8], %g5
++      sllx            %g4, $ileft, %g4
++      srlx            %g5, $iright, %g5
++      or              %g5, %g4, %g4
++4:
++      movxtod         %g4, %f0
++      prefetch        [$inp + 8+63], 20
++      add             $inp, 8, $inp
++      prefetch        [$out + 63], 22
++
++      des_ip          %f0, %f0
++      des_round       %f4, %f6, %f0, %f0
++      des_round       %f8, %f10, %f0, %f0
++      des_round       %f12, %f14, %f0, %f0
++      des_round       %f16, %f18, %f0, %f0
++      des_round       %f20, %f22, %f0, %f0
++      des_round       %f24, %f26, %f0, %f0
++      des_round       %f28, %f30, %f0, %f0
++      des_round       %f32, %f34, %f0, %f0
++      des_iip         %f0, %f0
++
++      fxor            %f2, %f0, %f0           ! ^= ivec
++      movxtod         %g4, %f2
++
++      brnz,pn         $omask, 2f
++      sub             $len, 1, $len
++
++      std             %f0, [$out + 0]
++      brnz,pt         $len, .Ldes_cbc_dec_loop
++      add             $out, 8, $out
++
++      st              %f2, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f3, [$ivec + 4]
++
++.align        16
++2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
++                                              ! and ~4x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f0           ! handle unaligned output
++
++      stda            %f0, [$out + $omask]0xc0        ! partial store
++      add             $out, 8, $out
++      orn             %g0, $omask, $omask
++      stda            %f0, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .Ldes_cbc_dec_loop+4
++      orn             %g0, $omask, $omask
++
++      st              %f2, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f3, [$ivec + 4]
++.type des_t4_cbc_decrypt,#function
++.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
++___
++
++# One might wonder why there are back-to-back des_iip/des_ip pairs
++# between EDE passes. Aren't they inverses of each other? Almost: the
++# net effect of the pair is that the two 32-bit words in the target
++# register are swapped. Treat the des_iip/des_ip pair as a way to
++# perform that required swap; it is actually the fastest way here.
++
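++# Editorial sketch (not from upstream): in plain Perl the net effect of
++# the des_iip/des_ip pair is just a swap of the two 32-bit halves of
++# the 64-bit block. swap_halves() is a hypothetical illustration and is
++# never called by the generator.
++sub swap_halves {
++    my ($block) = @_;          # 64-bit block, L in the upper half
++    return (($block & 0xffffffff) << 32) | (($block >> 32) & 0xffffffff);
++}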
++$code.=<<___;
++.globl        des_t4_ede3_cbc_encrypt
++.align        32
++des_t4_ede3_cbc_encrypt:
++      ld              [$ivec + 0], %f0        ! load ivec
++      ld              [$ivec + 4], %f1
++
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             0xff, $omask
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      sub             %g0, $ileft, $iright
++      and             $out, 7, %g4
++      alignaddrl      $out, %g0, $out
++      srl             $omask, %g4, $omask
++      srlx            $len, 3, $len
++      movrz           %g4, 0, $omask
++      prefetch        [$out], 22
++
++      ldd             [$key + 0x00], %f4      ! load key schedule
++      ldd             [$key + 0x08], %f6
++      ldd             [$key + 0x10], %f8
++      ldd             [$key + 0x18], %f10
++      ldd             [$key + 0x20], %f12
++      ldd             [$key + 0x28], %f14
++      ldd             [$key + 0x30], %f16
++      ldd             [$key + 0x38], %f18
++      ldd             [$key + 0x40], %f20
++      ldd             [$key + 0x48], %f22
++      ldd             [$key + 0x50], %f24
++      ldd             [$key + 0x58], %f26
++      ldd             [$key + 0x60], %f28
++      ldd             [$key + 0x68], %f30
++      ldd             [$key + 0x70], %f32
++      ldd             [$key + 0x78], %f34
++
++.Ldes_ede3_cbc_enc_loop:
++      ldx             [$inp + 0], %g4
++      brz,pt          $ileft, 4f
++      nop
++
++      ldx             [$inp + 8], %g5
++      sllx            %g4, $ileft, %g4
++      srlx            %g5, $iright, %g5
++      or              %g5, %g4, %g4
++4:
++      movxtod         %g4, %f2
++      prefetch        [$inp + 8+63], 20
++      add             $inp, 8, $inp
++      fxor            %f2, %f0, %f0           ! ^= ivec
++      prefetch        [$out + 63], 22
++
++      des_ip          %f0, %f0
++      des_round       %f4, %f6, %f0, %f0
++      des_round       %f8, %f10, %f0, %f0
++      des_round       %f12, %f14, %f0, %f0
++      des_round       %f16, %f18, %f0, %f0
++      ldd             [$key + 0x100-0x08], %f36
++      ldd             [$key + 0x100-0x10], %f38
++      des_round       %f20, %f22, %f0, %f0
++      ldd             [$key + 0x100-0x18], %f40
++      ldd             [$key + 0x100-0x20], %f42
++      des_round       %f24, %f26, %f0, %f0
++      ldd             [$key + 0x100-0x28], %f44
++      ldd             [$key + 0x100-0x30], %f46
++      des_round       %f28, %f30, %f0, %f0
++      ldd             [$key + 0x100-0x38], %f48
++      ldd             [$key + 0x100-0x40], %f50
++      des_round       %f32, %f34, %f0, %f0
++      ldd             [$key + 0x100-0x48], %f52
++      ldd             [$key + 0x100-0x50], %f54
++      des_iip         %f0, %f0
++
++      ldd             [$key + 0x100-0x58], %f56
++      ldd             [$key + 0x100-0x60], %f58
++      des_ip          %f0, %f0
++      ldd             [$key + 0x100-0x68], %f60
++      ldd             [$key + 0x100-0x70], %f62
++      des_round       %f36, %f38, %f0, %f0
++      ldd             [$key + 0x100-0x78], %f36
++      ldd             [$key + 0x100-0x80], %f38
++      des_round       %f40, %f42, %f0, %f0
++      des_round       %f44, %f46, %f0, %f0
++      des_round       %f48, %f50, %f0, %f0
++      ldd             [$key + 0x100+0x00], %f40
++      ldd             [$key + 0x100+0x08], %f42
++      des_round       %f52, %f54, %f0, %f0
++      ldd             [$key + 0x100+0x10], %f44
++      ldd             [$key + 0x100+0x18], %f46
++      des_round       %f56, %f58, %f0, %f0
++      ldd             [$key + 0x100+0x20], %f48
++      ldd             [$key + 0x100+0x28], %f50
++      des_round       %f60, %f62, %f0, %f0
++      ldd             [$key + 0x100+0x30], %f52
++      ldd             [$key + 0x100+0x38], %f54
++      des_round       %f36, %f38, %f0, %f0
++      ldd             [$key + 0x100+0x40], %f56
++      ldd             [$key + 0x100+0x48], %f58
++      des_iip         %f0, %f0
++
++      ldd             [$key + 0x100+0x50], %f60
++      ldd             [$key + 0x100+0x58], %f62
++      des_ip          %f0, %f0
++      ldd             [$key + 0x100+0x60], %f36
++      ldd             [$key + 0x100+0x68], %f38
++      des_round       %f40, %f42, %f0, %f0
++      ldd             [$key + 0x100+0x70], %f40
++      ldd             [$key + 0x100+0x78], %f42
++      des_round       %f44, %f46, %f0, %f0
++      des_round       %f48, %f50, %f0, %f0
++      des_round       %f52, %f54, %f0, %f0
++      des_round       %f56, %f58, %f0, %f0
++      des_round       %f60, %f62, %f0, %f0
++      des_round       %f36, %f38, %f0, %f0
++      des_round       %f40, %f42, %f0, %f0
++      des_iip         %f0, %f0
++
++      brnz,pn         $omask, 2f
++      sub             $len, 1, $len
++
++      std             %f0, [$out + 0]
++      brnz,pt         $len, .Ldes_ede3_cbc_enc_loop
++      add             $out, 8, $out
++
++      st              %f0, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f1, [$ivec + 4]
++
++.align        16
++2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
++                                              ! and ~2x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f2           ! handle unaligned output
++
++      stda            %f2, [$out + $omask]0xc0        ! partial store
++      add             $out, 8, $out
++      orn             %g0, $omask, $omask
++      stda            %f2, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .Ldes_ede3_cbc_enc_loop+4
++      orn             %g0, $omask, $omask
++
++      st              %f0, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f1, [$ivec + 4]
++.type des_t4_ede3_cbc_encrypt,#function
++.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
++
++.globl        des_t4_ede3_cbc_decrypt
++.align        32
++des_t4_ede3_cbc_decrypt:
++      ld              [$ivec + 0], %f2        ! load ivec
++      ld              [$ivec + 4], %f3
++
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             0xff, $omask
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      sub             %g0, $ileft, $iright
++      and             $out, 7, %g4
++      alignaddrl      $out, %g0, $out
++      srl             $omask, %g4, $omask
++      srlx            $len, 3, $len
++      movrz           %g4, 0, $omask
++      prefetch        [$out], 22
++
++      ldd             [$key + 0x100+0x78], %f4        ! load key schedule
++      ldd             [$key + 0x100+0x70], %f6
++      ldd             [$key + 0x100+0x68], %f8
++      ldd             [$key + 0x100+0x60], %f10
++      ldd             [$key + 0x100+0x58], %f12
++      ldd             [$key + 0x100+0x50], %f14
++      ldd             [$key + 0x100+0x48], %f16
++      ldd             [$key + 0x100+0x40], %f18
++      ldd             [$key + 0x100+0x38], %f20
++      ldd             [$key + 0x100+0x30], %f22
++      ldd             [$key + 0x100+0x28], %f24
++      ldd             [$key + 0x100+0x20], %f26
++      ldd             [$key + 0x100+0x18], %f28
++      ldd             [$key + 0x100+0x10], %f30
++      ldd             [$key + 0x100+0x08], %f32
++      ldd             [$key + 0x100+0x00], %f34
++
++.Ldes_ede3_cbc_dec_loop:
++      ldx             [$inp + 0], %g4
++      brz,pt          $ileft, 4f
++      nop
++
++      ldx             [$inp + 8], %g5
++      sllx            %g4, $ileft, %g4
++      srlx            %g5, $iright, %g5
++      or              %g5, %g4, %g4
++4:
++      movxtod         %g4, %f0
++      prefetch        [$inp + 8+63], 20
++      add             $inp, 8, $inp
++      prefetch        [$out + 63], 22
++
++      des_ip          %f0, %f0
++      des_round       %f4, %f6, %f0, %f0
++      des_round       %f8, %f10, %f0, %f0
++      des_round       %f12, %f14, %f0, %f0
++      des_round       %f16, %f18, %f0, %f0
++      ldd             [$key + 0x80+0x00], %f36
++      ldd             [$key + 0x80+0x08], %f38
++      des_round       %f20, %f22, %f0, %f0
++      ldd             [$key + 0x80+0x10], %f40
++      ldd             [$key + 0x80+0x18], %f42
++      des_round       %f24, %f26, %f0, %f0
++      ldd             [$key + 0x80+0x20], %f44
++      ldd             [$key + 0x80+0x28], %f46
++      des_round       %f28, %f30, %f0, %f0
++      ldd             [$key + 0x80+0x30], %f48
++      ldd             [$key + 0x80+0x38], %f50
++      des_round       %f32, %f34, %f0, %f0
++      ldd             [$key + 0x80+0x40], %f52
++      ldd             [$key + 0x80+0x48], %f54
++      des_iip         %f0, %f0
++
++      ldd             [$key + 0x80+0x50], %f56
++      ldd             [$key + 0x80+0x58], %f58
++      des_ip          %f0, %f0
++      ldd             [$key + 0x80+0x60], %f60
++      ldd             [$key + 0x80+0x68], %f62
++      des_round       %f36, %f38, %f0, %f0
++      ldd             [$key + 0x80+0x70], %f36
++      ldd             [$key + 0x80+0x78], %f38
++      des_round       %f40, %f42, %f0, %f0
++      des_round       %f44, %f46, %f0, %f0
++      des_round       %f48, %f50, %f0, %f0
++      ldd             [$key + 0x80-0x08], %f40
++      ldd             [$key + 0x80-0x10], %f42
++      des_round       %f52, %f54, %f0, %f0
++      ldd             [$key + 0x80-0x18], %f44
++      ldd             [$key + 0x80-0x20], %f46
++      des_round       %f56, %f58, %f0, %f0
++      ldd             [$key + 0x80-0x28], %f48
++      ldd             [$key + 0x80-0x30], %f50
++      des_round       %f60, %f62, %f0, %f0
++      ldd             [$key + 0x80-0x38], %f52
++      ldd             [$key + 0x80-0x40], %f54
++      des_round       %f36, %f38, %f0, %f0
++      ldd             [$key + 0x80-0x48], %f56
++      ldd             [$key + 0x80-0x50], %f58
++      des_iip         %f0, %f0
++
++      ldd             [$key + 0x80-0x58], %f60
++      ldd             [$key + 0x80-0x60], %f62
++      des_ip          %f0, %f0
++      ldd             [$key + 0x80-0x68], %f36
++      ldd             [$key + 0x80-0x70], %f38
++      des_round       %f40, %f42, %f0, %f0
++      ldd             [$key + 0x80-0x78], %f40
++      ldd             [$key + 0x80-0x80], %f42
++      des_round       %f44, %f46, %f0, %f0
++      des_round       %f48, %f50, %f0, %f0
++      des_round       %f52, %f54, %f0, %f0
++      des_round       %f56, %f58, %f0, %f0
++      des_round       %f60, %f62, %f0, %f0
++      des_round       %f36, %f38, %f0, %f0
++      des_round       %f40, %f42, %f0, %f0
++      des_iip         %f0, %f0
++
++      fxor            %f2, %f0, %f0           ! ^= ivec
++      movxtod         %g4, %f2
++
++      brnz,pn         $omask, 2f
++      sub             $len, 1, $len
++
++      std             %f0, [$out + 0]
++      brnz,pt         $len, .Ldes_ede3_cbc_dec_loop
++      add             $out, 8, $out
++
++      st              %f2, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f3, [$ivec + 4]
++
++.align        16
++2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f0           ! handle unaligned output
++
++      stda            %f0, [$out + $omask]0xc0        ! partial store
++      add             $out, 8, $out
++      orn             %g0, $omask, $omask
++      stda            %f0, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .Ldes_ede3_cbc_dec_loop+4
++      orn             %g0, $omask, $omask
++
++      st              %f2, [$ivec + 0]        ! write out ivec
++      retl
++      st              %f3, [$ivec + 4]
++.type des_t4_ede3_cbc_decrypt,#function
++.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
++___
++}
++$code.=<<___;
++.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
++.align  4
++___
++
++&emit_assembler();
++
++close STDOUT;
+Index: crypto/perlasm/sparcv9_modes.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
+--- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,1680 @@
++#!/usr/bin/env perl
++
++# Implementations of specific cipher modes for SPARC Architecture
++# 2011. There is a T4 dependency, though: an ASI value that is not
++# specified in the Architecture Manual. But as the SPARC universe is
++# rather monocultural, we assume that any processor capable of
++# executing the crypto instructions can handle the ASI in question as
++# well. This means we ought to keep our eyes open when new processors
++# emerge...
++#
++# As for the above-mentioned ASI: it is the so-called "block
++# initializing store", which cancels the "read" in "read-update-write"
++# on cache lines. This is a "cooperative" optimization, as it reduces
++# overall pressure on the memory interface. The benefit can't be
++# observed/quantified with the usual benchmarks; on the contrary, you
++# may notice that single-thread performance for parallelizable modes
++# is ~1.5% worse for the largest block sizes [though a few percent
++# better for shorter ones]. All this is based on suggestions from
++# David Miller.
++
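++# Editorial sketch (not from upstream): the generators below fall back
++# from the block-initializing "stda ... 0xe2" stores to plain "std"
++# whenever the output is misaligned, the request is short (the exact
++# threshold varies per mode), or the operation is in-place.
++# use_blk_init() is a hypothetical restatement of that test and is
++# never called here.
++sub use_blk_init {
++    my ($out_misaligned, $len, $in_place, $threshold) = @_;
++    return !($out_misaligned || $len < $threshold || $in_place);
++}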
++sub asm_init {                # to be called with @ARGV as argument
++    for (@_)          { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
++    if ($::abibits==64)       { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
++    else              { $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
++}
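++# (Editorial note: per the comment above, callers are expected to
++# invoke asm_init(@ARGV) so that flags such as -m64 or -xarch=v9 select
++# the 64-bit bias, frame size and condition-code register.)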
++
++# unified interface
++my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
++# local variables
++my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
++
++sub alg_cbc_encrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl        ${alg}${bits}_t4_cbc_encrypt
++.align        32
++${alg}${bits}_t4_cbc_encrypt:
++      save            %sp, -$::frame, %sp
++      sub             $inp, $out, $blk_init   ! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++      andcc           $ivec, 7, $ivoff
++      alignaddr       $ivec, %g0, $ivec
++
++      ldd             [$ivec + 0], %f0        ! load ivec
++      bz,pt           %icc, 1f
++      ldd             [$ivec + 8], %f2
++      ldd             [$ivec + 16], %f4
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++1:
++___
++$::code.=<<___ if ($::evp);
++      ld              [$ivec + 0], %f0
++      ld              [$ivec + 4], %f1
++      ld              [$ivec + 8], %f2
++      ld              [$ivec + 12], %f3
++___
++$::code.=<<___;
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      call            _${alg}${bits}_load_enckey
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             64, $iright
++      mov             0xff, $omask
++      sub             $iright, $ileft, $iright
++      and             $out, 7, $ooff
++      cmp             $len, 127
++      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
++      movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
++      brnz,pn         $blk_init, .L${bits}cbc_enc_blk !       $inp==$out)
++      srl             $omask, $ooff, $omask
++
++      alignaddrl      $out, %g0, $out
++      srlx            $len, 4, $len
++      prefetch        [$out], 22
++
++.L${bits}_cbc_enc_loop:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++4:
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      movxtod         %o0, %f12
++      movxtod         %o1, %f14
++
++      fxor            %f12, %f0, %f0          ! ^= ivec
++      fxor            %f14, %f2, %f2
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 16+63], 20
++      call            _${alg}${bits}_encrypt_1x
++      add             $inp, 16, $inp
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 1, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      brnz,pt         $len, .L${bits}_cbc_enc_loop
++      add             $out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++      st              %f0, [$ivec + 0]
++      st              %f1, [$ivec + 4]
++      st              %f2, [$ivec + 8]
++      st              %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, 3f
++      nop
++
++      std             %f0, [$ivec + 0]        ! write out ivec
++      std             %f2, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f4           ! handle unaligned output
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++
++      stda            %f4, [$out + $omask]0xc0        ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $omask, $omask
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_cbc_enc_loop+4
++      orn             %g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++      st              %f0, [$ivec + 0]
++      st              %f1, [$ivec + 4]
++      st              %f2, [$ivec + 8]
++      st              %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, 3f
++      nop
++
++      std             %f0, [$ivec + 0]        ! write out ivec
++      std             %f2, [$ivec + 8]
++      ret
++      restore
++
++.align        16
++3:    alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
++      mov             0xff, $omask
++      srl             $omask, $ivoff, $omask
++      faligndata      %f0, %f0, %f4
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++      stda            %f4, [$ivec + $omask]0xc0
++      std             %f6, [$ivec + 8]
++      add             $ivec, 16, $ivec
++      orn             %g0, $omask, $omask
++      stda            %f8, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}cbc_enc_blk:
++      add     $out, $len, $blk_init
++      and     $blk_init, 63, $blk_init        ! tail
++      sub     $len, $blk_init, $len
++      add     $blk_init, 15, $blk_init        ! round up to 16n
++      srlx    $len, 4, $len
++      srl     $blk_init, 4, $blk_init
++
++.L${bits}_cbc_enc_blk_loop:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 5f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++5:
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      movxtod         %o0, %f12
++      movxtod         %o1, %f14
++
++      fxor            %f12, %f0, %f0          ! ^= ivec
++      fxor            %f14, %f2, %f2
++      prefetch        [$inp + 16+63], 20
++      call            _${alg}${bits}_encrypt_1x
++      add             $inp, 16, $inp
++      sub             $len, 1, $len
++              
++      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      brnz,pt         $len, .L${bits}_cbc_enc_blk_loop
++      add             $out, 8, $out
++
++      membar          #StoreLoad|#StoreStore
++      brnz,pt         $blk_init, .L${bits}_cbc_enc_loop
++      mov             $blk_init, $len
++___
++$::code.=<<___ if ($::evp);
++      st              %f0, [$ivec + 0]
++      st              %f1, [$ivec + 4]
++      st              %f2, [$ivec + 8]
++      st              %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, 3b
++      nop
++
++      std             %f0, [$ivec + 0]        ! write out ivec
++      std             %f2, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++.type ${alg}${bits}_t4_cbc_encrypt,#function
++.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
++___
++}
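++# (Editorial note: callers such as the aes/cmll T4 scripts are expected
++# to instantiate this template once per key size, e.g.
++# alg_cbc_encrypt_implement("aes",$bits) for each supported $bits.)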
++
++sub alg_cbc_decrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl        ${alg}${bits}_t4_cbc_decrypt
++.align        32
++${alg}${bits}_t4_cbc_decrypt:
++      save            %sp, -$::frame, %sp
++      sub             $inp, $out, $blk_init   ! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++      andcc           $ivec, 7, $ivoff
++      alignaddr       $ivec, %g0, $ivec
++
++      ldd             [$ivec + 0], %f12       ! load ivec
++      bz,pt           %icc, 1f
++      ldd             [$ivec + 8], %f14
++      ldd             [$ivec + 16], %f0
++      faligndata      %f12, %f14, %f12
++      faligndata      %f14, %f0, %f14
++1:
++___
++$::code.=<<___ if ($::evp);
++      ld              [$ivec + 0], %f12       ! load ivec
++      ld              [$ivec + 4], %f13
++      ld              [$ivec + 8], %f14
++      ld              [$ivec + 12], %f15
++___
++$::code.=<<___;
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      call            _${alg}${bits}_load_deckey
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             64, $iright
++      mov             0xff, $omask
++      sub             $iright, $ileft, $iright
++      and             $out, 7, $ooff
++      cmp             $len, 255
++      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
++      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
++      brnz,pn         $blk_init, .L${bits}cbc_dec_blk !       $inp==$out)
++      srl             $omask, $ooff, $omask
++
++      andcc           $len, 16, %g0           ! is number of blocks even?
++      srlx            $len, 4, $len
++      alignaddrl      $out, %g0, $out
++      bz              %icc, .L${bits}_cbc_dec_loop2x
++      prefetch        [$out], 22
++.L${bits}_cbc_dec_loop:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++4:
++      xor             %g4, %o0, %o2           ! ^= rk[0]
++      xor             %g5, %o1, %o3
++      movxtod         %o2, %f0
++      movxtod         %o3, %f2
++
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 16+63], 20
++      call            _${alg}${bits}_decrypt_1x
++      add             $inp, 16, $inp
++
++      fxor            %f12, %f0, %f0          ! ^= ivec
++      fxor            %f14, %f2, %f2
++      movxtod         %o0, %f12
++      movxtod         %o1, %f14
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 1, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      brnz,pt         $len, .L${bits}_cbc_dec_loop2x
++      add             $out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++      st              %f12, [$ivec + 0]
++      st              %f13, [$ivec + 4]
++      st              %f14, [$ivec + 8]
++      st              %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++      nop
++
++      std             %f12, [$ivec + 0]       ! write out ivec
++      std             %f14, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f4           ! handle unaligned output
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++
++      stda            %f4, [$out + $omask]0xc0        ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $omask, $omask
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
++      orn             %g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++      st              %f12, [$ivec + 0]
++      st              %f13, [$ivec + 4]
++      st              %f14, [$ivec + 8]
++      st              %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++      nop
++
++      std             %f12, [$ivec + 0]       ! write out ivec
++      std             %f14, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}_cbc_dec_loop2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++4:
++      xor             %g4, %o0, %o4           ! ^= rk[0]
++      xor             %g5, %o1, %o5
++      movxtod         %o4, %f0
++      movxtod         %o5, %f2
++      xor             %g4, %o2, %o4
++      xor             %g5, %o3, %o5
++      movxtod         %o4, %f4
++      movxtod         %o5, %f6
++
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 32+63], 20
++      call            _${alg}${bits}_decrypt_2x
++      add             $inp, 32, $inp
++
++      movxtod         %o0, %f8
++      movxtod         %o1, %f10
++      fxor            %f12, %f0, %f0          ! ^= ivec
++      fxor            %f14, %f2, %f2
++      movxtod         %o2, %f12
++      movxtod         %o3, %f14
++      fxor            %f8, %f4, %f4
++      fxor            %f10, %f6, %f6
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 2, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      std             %f4, [$out + 16]
++      std             %f6, [$out + 24]
++      brnz,pt         $len, .L${bits}_cbc_dec_loop2x
++      add             $out, 32, $out
++___
++$::code.=<<___ if ($::evp);
++      st              %f12, [$ivec + 0]
++      st              %f13, [$ivec + 4]
++      st              %f14, [$ivec + 8]
++      st              %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++      nop
++
++      std             %f12, [$ivec + 0]       ! write out ivec
++      std             %f14, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f8           ! handle unaligned output
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++      faligndata      %f4, %f6, %f4
++      faligndata      %f6, %f6, %f6
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++      std             %f0, [$out + 8]
++      std             %f2, [$out + 16]
++      std             %f4, [$out + 24]
++      add             $out, 32, $out
++      orn             %g0, $omask, $omask
++      stda            %f6, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
++      orn             %g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++      st              %f12, [$ivec + 0]
++      st              %f13, [$ivec + 4]
++      st              %f14, [$ivec + 8]
++      st              %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++      nop
++
++      std             %f12, [$ivec + 0]       ! write out ivec
++      std             %f14, [$ivec + 8]
++      ret
++      restore
++
++.align        16
++.L${bits}_cbc_dec_unaligned_ivec:
++      alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
++      mov             0xff, $omask
++      srl             $omask, $ivoff, $omask
++      faligndata      %f12, %f12, %f0
++      faligndata      %f12, %f14, %f2
++      faligndata      %f14, %f14, %f4
++      stda            %f0, [$ivec + $omask]0xc0
++      std             %f2, [$ivec + 8]
++      add             $ivec, 16, $ivec
++      orn             %g0, $omask, $omask
++      stda            %f4, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}cbc_dec_blk:
++      add     $out, $len, $blk_init
++      and     $blk_init, 63, $blk_init        ! tail
++      sub     $len, $blk_init, $len
++      add     $blk_init, 15, $blk_init        ! round up to 16n
++      srlx    $len, 4, $len
++      srl     $blk_init, 4, $blk_init
++      sub     $len, 1, $len
++      add     $blk_init, 1, $blk_init
++
++.L${bits}_cbc_dec_blk_loop2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 5f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++5:
++      xor             %g4, %o0, %o4           ! ^= rk[0]
++      xor             %g5, %o1, %o5
++      movxtod         %o4, %f0
++      movxtod         %o5, %f2
++      xor             %g4, %o2, %o4
++      xor             %g5, %o3, %o5
++      movxtod         %o4, %f4
++      movxtod         %o5, %f6
++
++      prefetch        [$inp + 32+63], 20
++      call            _${alg}${bits}_decrypt_2x
++      add             $inp, 32, $inp
++      subcc           $len, 2, $len
++
++      movxtod         %o0, %f8
++      movxtod         %o1, %f10
++      fxor            %f12, %f0, %f0          ! ^= ivec
++      fxor            %f14, %f2, %f2
++      movxtod         %o2, %f12
++      movxtod         %o3, %f14
++      fxor            %f8, %f4, %f4
++      fxor            %f10, %f6, %f6
++
++      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      bgu,pt          $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
++      add             $out, 8, $out
++
++      add             $blk_init, $len, $len
++      andcc           $len, 1, %g0            ! is number of blocks even?
++      membar          #StoreLoad|#StoreStore
++      bnz,pt          %icc, .L${bits}_cbc_dec_loop
++      srl             $len, 0, $len
++      brnz,pn         $len, .L${bits}_cbc_dec_loop2x
++      nop
++___
++$::code.=<<___ if ($::evp);
++      st              %f12, [$ivec + 0]       ! write out ivec
++      st              %f13, [$ivec + 4]
++      st              %f14, [$ivec + 8]
++      st              %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++      brnz,pn         $ivoff, 3b
++      nop
++
++      std             %f12, [$ivec + 0]       ! write out ivec
++      std             %f14, [$ivec + 8]
++___
++$::code.=<<___;
++      ret
++      restore
++.type ${alg}${bits}_t4_cbc_decrypt,#function
++.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
++___
++}
++
++sub alg_ctr32_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl        ${alg}${bits}_t4_ctr32_encrypt
++.align        32
++${alg}${bits}_t4_ctr32_encrypt:
++      save            %sp, -$::frame, %sp
++
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      call            _${alg}${bits}_load_enckey
++      sllx            $len, 4, $len
++
++      ld              [$ivec + 0], %l4        ! counter
++      ld              [$ivec + 4], %l5
++      ld              [$ivec + 8], %l6
++      ld              [$ivec + 12], %l7
++
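++      ! only the least significant counter word, %l7, is incremented in
++      ! the loops below (hence "ctr32"); the upper 96 bits stay fixed
++      ! and are pre-xored with rk[0]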
++      sllx            %l4, 32, %o5
++      or              %l5, %o5, %o5
++      sllx            %l6, 32, %g1
++      xor             %o5, %g4, %g4           ! ^= rk[0]
++      xor             %g1, %g5, %g5
++      movxtod         %g4, %f14               ! most significant 64 bits
++
++      sub             $inp, $out, $blk_init   ! $inp!=$out
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             64, $iright
++      mov             0xff, $omask
++      sub             $iright, $ileft, $iright
++      and             $out, 7, $ooff
++      cmp             $len, 255
++      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
++      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
++      brnz,pn         $blk_init, .L${bits}_ctr32_blk  !       $inp==$out)
++      srl             $omask, $ooff, $omask
++
++      andcc           $len, 16, %g0           ! is number of blocks even?
++      alignaddrl      $out, %g0, $out
++      bz              %icc, .L${bits}_ctr32_loop2x
++      srlx            $len, 4, $len
++.L${bits}_ctr32_loop:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++4:
++      xor             %g5, %l7, %g1           ! ^= rk[0]
++      add             %l7, 1, %l7
++      movxtod         %g1, %f2
++      srl             %l7, 0, %l7             ! clruw
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 16+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++      aes_eround01    %f16, %f14, %f2, %f4
++      aes_eround23    %f18, %f14, %f2, %f2
++___
++$::code.=<<___ if ($alg eq "cmll");
++      camellia_f      %f16, %f2, %f14, %f2
++      camellia_f      %f18, %f14, %f2, %f0
++___
++$::code.=<<___;
++      call            _${alg}${bits}_encrypt_1x+8
++      add             $inp, 16, $inp
++
++      movxtod         %o0, %f10
++      movxtod         %o1, %f12
++      fxor            %f10, %f0, %f0          ! ^= inp
++      fxor            %f12, %f2, %f2
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 1, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      brnz,pt         $len, .L${bits}_ctr32_loop2x
++      add             $out, 16, $out
++
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f4           ! handle unaligned output
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++      stda            %f4, [$out + $omask]0xc0        ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $omask, $omask
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_ctr32_loop2x+4
++      orn             %g0, $omask, $omask
++
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}_ctr32_loop2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++4:
++      xor             %g5, %l7, %g1           ! ^= rk[0]
++      add             %l7, 1, %l7
++      movxtod         %g1, %f2
++      srl             %l7, 0, %l7             ! clruw
++      xor             %g5, %l7, %g1
++      add             %l7, 1, %l7
++      movxtod         %g1, %f6
++      srl             %l7, 0, %l7             ! clruw
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 32+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++      aes_eround01    %f16, %f14, %f2, %f8
++      aes_eround23    %f18, %f14, %f2, %f2
++      aes_eround01    %f16, %f14, %f6, %f10
++      aes_eround23    %f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++      camellia_f      %f16, %f2, %f14, %f2
++      camellia_f      %f16, %f6, %f14, %f6
++      camellia_f      %f18, %f14, %f2, %f0
++      camellia_f      %f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++      call            _${alg}${bits}_encrypt_2x+16
++      add             $inp, 32, $inp
++
++      movxtod         %o0, %f8
++      movxtod         %o1, %f10
++      movxtod         %o2, %f12
++      fxor            %f8, %f0, %f0           ! ^= inp
++      movxtod         %o3, %f8
++      fxor            %f10, %f2, %f2
++      fxor            %f12, %f4, %f4
++      fxor            %f8, %f6, %f6
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 2, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      std             %f4, [$out + 16]
++      std             %f6, [$out + 24]
++      brnz,pt         $len, .L${bits}_ctr32_loop2x
++      add             $out, 32, $out
++
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f8           ! handle unaligned output
++      faligndata      %f0, %f2, %f0
++      faligndata      %f2, %f4, %f2
++      faligndata      %f4, %f6, %f4
++      faligndata      %f6, %f6, %f6
++
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++      std             %f0, [$out + 8]
++      std             %f2, [$out + 16]
++      std             %f4, [$out + 24]
++      add             $out, 32, $out
++      orn             %g0, $omask, $omask
++      stda            %f6, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_ctr32_loop2x+4
++      orn             %g0, $omask, $omask
++
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}_ctr32_blk:
++      add     $out, $len, $blk_init
++      and     $blk_init, 63, $blk_init        ! tail
++      sub     $len, $blk_init, $len
++      add     $blk_init, 15, $blk_init        ! round up to 16n
++      srlx    $len, 4, $len
++      srl     $blk_init, 4, $blk_init
++      sub     $len, 1, $len
++      add     $blk_init, 1, $blk_init
++
++.L${bits}_ctr32_blk_loop2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 5f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++5:
++      xor             %g5, %l7, %g1           ! ^= rk[0]
++      add             %l7, 1, %l7
++      movxtod         %g1, %f2
++      srl             %l7, 0, %l7             ! clruw
++      xor             %g5, %l7, %g1
++      add             %l7, 1, %l7
++      movxtod         %g1, %f6
++      srl             %l7, 0, %l7             ! clruw
++      prefetch        [$inp + 32+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++      aes_eround01    %f16, %f14, %f2, %f8
++      aes_eround23    %f18, %f14, %f2, %f2
++      aes_eround01    %f16, %f14, %f6, %f10
++      aes_eround23    %f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++      camellia_f      %f16, %f2, %f14, %f2
++      camellia_f      %f16, %f6, %f14, %f6
++      camellia_f      %f18, %f14, %f2, %f0
++      camellia_f      %f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++      call            _${alg}${bits}_encrypt_2x+16
++      add             $inp, 32, $inp
++      subcc           $len, 2, $len
++
++      movxtod         %o0, %f8
++      movxtod         %o1, %f10
++      movxtod         %o2, %f12
++      fxor            %f8, %f0, %f0           ! ^= inp
++      movxtod         %o3, %f8
++      fxor            %f10, %f2, %f2
++      fxor            %f12, %f4, %f4
++      fxor            %f8, %f6, %f6
++
++      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      bgu,pt          $::size_t_cc, .L${bits}_ctr32_blk_loop2x
++      add             $out, 8, $out
++
++      add             $blk_init, $len, $len
++      andcc           $len, 1, %g0            ! is number of blocks even?
++      membar          #StoreLoad|#StoreStore
++      bnz,pt          %icc, .L${bits}_ctr32_loop
++      srl             $len, 0, $len
++      brnz,pn         $len, .L${bits}_ctr32_loop2x
++      nop
++
++      ret
++      restore
++.type ${alg}${bits}_t4_ctr32_encrypt,#function
++.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
++___
++}
++
++sub alg_xts_implement {
++my ($alg,$bits,$dir) = @_;
++my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
++my $rem=$ivec;
++
++$::code.=<<___;
++.globl        ${alg}${bits}_t4_xts_${dir}crypt
++.align        32
++${alg}${bits}_t4_xts_${dir}crypt:
++      save            %sp, -$::frame-16, %sp
++
++      mov             $ivec, %o0
++      add             %fp, $::bias-16, %o1
++      call            ${alg}_t4_encrypt
++      mov             $key2, %o2
++
++      add             %fp, $::bias-16, %l7
++      ldxa            [%l7]0x88, %g2
++      add             %fp, $::bias-8, %l7
++      ldxa            [%l7]0x88, %g3          ! %g3:%g2 is tweak
++
++      sethi           %hi(0x76543210), %l7
++      or              %l7, %lo(0x76543210), %l7
++      bmask           %l7, %g0, %g0           ! byte swap mask
++
++      prefetch        [$inp], 20
++      prefetch        [$inp + 63], 20
++      call            _${alg}${bits}_load_${dir}ckey
++      and             $len, 15,  $rem
++      and             $len, -16, $len
++___
++$::code.=<<___ if ($dir eq "de");
++      mov             0, %l7
++      movrnz          $rem, 16,  %l7
++      sub             $len, %l7, $len
++___
++$::code.=<<___;
++
++      sub             $inp, $out, $blk_init   ! $inp!=$out
++      and             $inp, 7, $ileft
++      andn            $inp, 7, $inp
++      sll             $ileft, 3, $ileft
++      mov             64, $iright
++      mov             0xff, $omask
++      sub             $iright, $ileft, $iright
++      and             $out, 7, $ooff
++      cmp             $len, 255
++      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
++      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
++      brnz,pn         $blk_init, .L${bits}_xts_${dir}blk !    $inp==$out)
++      srl             $omask, $ooff, $omask
++
++      andcc           $len, 16, %g0           ! is number of blocks even?
++___
++$::code.=<<___ if ($dir eq "de");
++      brz,pn          $len, .L${bits}_xts_${dir}steal
++___
++$::code.=<<___;
++      alignaddrl      $out, %g0, $out
++      bz              %icc, .L${bits}_xts_${dir}loop2x
++      srlx            $len, 4, $len
++.L${bits}_xts_${dir}loop:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++4:
++      movxtod         %g2, %f12
++      movxtod         %g3, %f14
++      bshuffle        %f12, %f12, %f12
++      bshuffle        %f14, %f14, %f14
++
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      movxtod         %o0, %f0
++      movxtod         %o1, %f2
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 16+63], 20
++      call            _${alg}${bits}_${dir}crypt_1x
++      add             $inp, 16, $inp
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++
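++      ! the next five instructions double the tweak in GF(2^128): the
++      ! carry out of bit 127 folds back in as 0x87, i.e. reduction
++      ! modulo x^128 + x^7 + x^2 + x + 1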
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %g2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %g3
++      xor             %l7, %g2, %g2
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 1, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      brnz,pt         $len, .L${bits}_xts_${dir}loop2x
++      add             $out, 16, $out
++
++      brnz,pn         $rem, .L${bits}_xts_${dir}steal
++      nop
++
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f4           ! handle unaligned output
++      faligndata      %f0, %f2, %f6
++      faligndata      %f2, %f2, %f8
++      stda            %f4, [$out + $omask]0xc0        ! partial store
++      std             %f6, [$out + 8]
++      add             $out, 16, $out
++      orn             %g0, $omask, $omask
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_xts_${dir}loop2x+4
++      orn             %g0, $omask, $omask
++
++      brnz,pn         $rem, .L${bits}_xts_${dir}steal
++      nop
++
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}_xts_${dir}loop2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 4f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++4:
++      movxtod         %g2, %f12
++      movxtod         %g3, %f14
++      bshuffle        %f12, %f12, %f12
++      bshuffle        %f14, %f14, %f14
++
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %g2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %g3
++      xor             %l7, %g2, %g2
++
++      movxtod         %g2, %f8
++      movxtod         %g3, %f10
++      bshuffle        %f8,  %f8,  %f8
++      bshuffle        %f10, %f10, %f10
++
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      xor             %g4, %o2, %o2           ! ^= rk[0]
++      xor             %g5, %o3, %o3
++      movxtod         %o0, %f0
++      movxtod         %o1, %f2
++      movxtod         %o2, %f4
++      movxtod         %o3, %f6
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++      fxor            %f8,  %f4, %f4          ! ^= tweak[0]
++      fxor            %f10, %f6, %f6
++
++      prefetch        [$out + 63], 22
++      prefetch        [$inp + 32+63], 20
++      call            _${alg}${bits}_${dir}crypt_2x
++      add             $inp, 32, $inp
++
++      movxtod         %g2, %f8
++      movxtod         %g3, %f10
++
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %g2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %g3
++      xor             %l7, %g2, %g2
++
++      bshuffle        %f8,  %f8,  %f8
++      bshuffle        %f10, %f10, %f10
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++      fxor            %f8,  %f4, %f4
++      fxor            %f10, %f6, %f6
++
++      brnz,pn         $ooff, 2f
++      sub             $len, 2, $len
++              
++      std             %f0, [$out + 0]
++      std             %f2, [$out + 8]
++      std             %f4, [$out + 16]
++      std             %f6, [$out + 24]
++      brnz,pt         $len, .L${bits}_xts_${dir}loop2x
++      add             $out, 32, $out
++
++      fsrc2           %f4, %f0
++      fsrc2           %f6, %f2
++      brnz,pn         $rem, .L${bits}_xts_${dir}steal
++      nop
++
++      ret
++      restore
++
++.align        16
++2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
++                                              ! and ~3x deterioration
++                                              ! in inp==out case
++      faligndata      %f0, %f0, %f8           ! handle unaligned output
++      faligndata      %f0, %f2, %f10
++      faligndata      %f2, %f4, %f12
++      faligndata      %f4, %f6, %f14
++      faligndata      %f6, %f6, %f0
++
++      stda            %f8, [$out + $omask]0xc0        ! partial store
++      std             %f10, [$out + 8]
++      std             %f12, [$out + 16]
++      std             %f14, [$out + 24]
++      add             $out, 32, $out
++      orn             %g0, $omask, $omask
++      stda            %f0, [$out + $omask]0xc0        ! partial store
++
++      brnz,pt         $len, .L${bits}_xts_${dir}loop2x+4
++      orn             %g0, $omask, $omask
++
++      fsrc2           %f4, %f0
++      fsrc2           %f6, %f2
++      brnz,pn         $rem, .L${bits}_xts_${dir}steal
++      nop
++
++      ret
++      restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align        32
++.L${bits}_xts_${dir}blk:
++      add     $out, $len, $blk_init
++      and     $blk_init, 63, $blk_init        ! tail
++      sub     $len, $blk_init, $len
++      add     $blk_init, 15, $blk_init        ! round up to 16n
++      srlx    $len, 4, $len
++      srl     $blk_init, 4, $blk_init
++      sub     $len, 1, $len
++      add     $blk_init, 1, $blk_init
++
++.L${bits}_xts_${dir}blk2x:
++      ldx             [$inp + 0], %o0
++      ldx             [$inp + 8], %o1
++      ldx             [$inp + 16], %o2
++      brz,pt          $ileft, 5f
++      ldx             [$inp + 24], %o3
++
++      ldx             [$inp + 32], %o4
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      or              %g1, %o0, %o0
++      sllx            %o1, $ileft, %o1
++      srlx            %o2, $iright, %g1
++      or              %g1, %o1, %o1
++      sllx            %o2, $ileft, %o2
++      srlx            %o3, $iright, %g1
++      or              %g1, %o2, %o2
++      sllx            %o3, $ileft, %o3
++      srlx            %o4, $iright, %o4
++      or              %o4, %o3, %o3
++5:
++      movxtod         %g2, %f12
++      movxtod         %g3, %f14
++      bshuffle        %f12, %f12, %f12
++      bshuffle        %f14, %f14, %f14
++
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %g2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %g3
++      xor             %l7, %g2, %g2
++
++      movxtod         %g2, %f8
++      movxtod         %g3, %f10
++      bshuffle        %f8,  %f8,  %f8
++      bshuffle        %f10, %f10, %f10
++
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      xor             %g4, %o2, %o2           ! ^= rk[0]
++      xor             %g5, %o3, %o3
++      movxtod         %o0, %f0
++      movxtod         %o1, %f2
++      movxtod         %o2, %f4
++      movxtod         %o3, %f6
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++      fxor            %f8,  %f4, %f4          ! ^= tweak[0]
++      fxor            %f10, %f6, %f6
++
++      prefetch        [$inp + 32+63], 20
++      call            _${alg}${bits}_${dir}crypt_2x
++      add             $inp, 32, $inp
++
++      movxtod         %g2, %f8
++      movxtod         %g3, %f10
++
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %g2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %g3
++      xor             %l7, %g2, %g2
++
++      bshuffle        %f8,  %f8,  %f8
++      bshuffle        %f10, %f10, %f10
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++      fxor            %f8,  %f4, %f4
++      fxor            %f10, %f6, %f6
++
++      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      add             $out, 8, $out
++      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
++      bgu,pt          $::size_t_cc, .L${bits}_xts_${dir}blk2x
++      add             $out, 8, $out
++
++      add             $blk_init, $len, $len
++      andcc           $len, 1, %g0            ! is number of blocks even?
++      membar          #StoreLoad|#StoreStore
++      bnz,pt          %icc, .L${bits}_xts_${dir}loop
++      srl             $len, 0, $len
++      brnz,pn         $len, .L${bits}_xts_${dir}loop2x
++      nop
++
++      fsrc2           %f4, %f0
++      fsrc2           %f6, %f2
++      brnz,pn         $rem, .L${bits}_xts_${dir}steal
++      nop
++
++      ret
++      restore
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++___
++$code.=<<___ if ($dir eq "en");
++.align        32
++.L${bits}_xts_${dir}steal:
++      std             %f0, [%fp + $::bias-16] ! copy of output
++      std             %f2, [%fp + $::bias-8]
++
++      srl             $ileft, 3, $ileft
++      add             %fp, $::bias-16, %l7
++      add             $inp, $ileft, $inp      ! original $inp+$len&-15
++      add             $out, $ooff, $out       ! original $out+$len&-15
++      mov             0, $ileft
++      nop                                     ! align
++
++.L${bits}_xts_${dir}stealing:
++      ldub            [$inp + $ileft], %o0
++      ldub            [%l7  + $ileft], %o1
++      dec             $rem
++      stb             %o0, [%l7  + $ileft]
++      stb             %o1, [$out + $ileft]
++      brnz            $rem, .L${bits}_xts_${dir}stealing
++      inc             $ileft
++
++      mov             %l7, $inp
++      sub             $out, 16, $out
++      mov             0, $ileft
++      sub             $out, $ooff, $out
++      ba              .L${bits}_xts_${dir}loop        ! one more time
++      mov             1, $len                         ! $rem is 0
++___
++$code.=<<___ if ($dir eq "de");
++.align        32
++.L${bits}_xts_${dir}steal:
++      ldx             [$inp + 0], %o0
++      brz,pt          $ileft, 8f
++      ldx             [$inp + 8], %o1
++
++      ldx             [$inp + 16], %o2
++      sllx            %o0, $ileft, %o0
++      srlx            %o1, $iright, %g1
++      sllx            %o1, $ileft, %o1
++      or              %g1, %o0, %o0
++      srlx            %o2, $iright, %o2
++      or              %o2, %o1, %o1
++8:
++      srax            %g3, 63, %l7            ! next tweak value
++      addcc           %g2, %g2, %o2
++      and             %l7, 0x87, %l7
++      addxc           %g3, %g3, %o3
++      xor             %l7, %o2, %o2
++
++      movxtod         %o2, %f12
++      movxtod         %o3, %f14
++      bshuffle        %f12, %f12, %f12
++      bshuffle        %f14, %f14, %f14
++
++      xor             %g4, %o0, %o0           ! ^= rk[0]
++      xor             %g5, %o1, %o1
++      movxtod         %o0, %f0
++      movxtod         %o1, %f2
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++
++      call            _${alg}${bits}_${dir}crypt_1x
++      add             $inp, 16, $inp
++
++      fxor            %f12, %f0, %f0          ! ^= tweak[0]
++      fxor            %f14, %f2, %f2
++
++      std             %f0, [%fp + $::bias-16]
++      std             %f2, [%fp + $::bias-8]
++
++      srl             $ileft, 3, $ileft
++      add             %fp, $::bias-16, %l7
++      add             $inp, $ileft, $inp      ! original $inp+$len&-15
++      add             $out, $ooff, $out       ! original $out+$len&-15
++      mov             0, $ileft
++      add             $out, 16, $out
++      nop                                     ! align
++
++.L${bits}_xts_${dir}stealing:
++      ldub            [$inp + $ileft], %o0
++      ldub            [%l7  + $ileft], %o1
++      dec             $rem
++      stb             %o0, [%l7  + $ileft]
++      stb             %o1, [$out + $ileft]
++      brnz            $rem, .L${bits}_xts_${dir}stealing
++      inc             $ileft
++
++      mov             %l7, $inp
++      sub             $out, 16, $out
++      mov             0, $ileft
++      sub             $out, $ooff, $out
++      ba              .L${bits}_xts_${dir}loop        ! one more time
++      mov             1, $len                         ! $rem is 0
++___
++$code.=<<___;
++      ret
++      restore
++.type ${alg}${bits}_t4_xts_${dir}crypt,#function
++.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
++___
++}
++
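The recurring srax/addcc/and/addxc/xor sequence above computes the next
XTS tweak: the 128-bit tweak held in %g2 (low half) and %g3 (high half)
is multiplied by x in GF(2^128), folding the bit shifted off the top
back in as the reduction polynomial 0x87. A minimal Perl model of that
update, as an illustration only (not part of the patch; assumes a
64-bit perl):

    # %g2 = low 64 bits of the tweak, %g3 = high 64 bits
    sub next_tweak {
        my ($lo, $hi) = @_;
        my $poly  = ($hi >> 63) ? 0x87 : 0;  # srax %g3,63 ; and %l7,0x87
        my $carry = ($lo >> 63) & 1;         # carry out of addcc %g2,%g2
        $lo = (($lo << 1) ^ $poly)  & 0xffffffffffffffff;  # addcc + xor
        $hi = (($hi << 1) | $carry) & 0xffffffffffffffff;  # addxc %g3,%g3
        return ($lo, $hi);
    }
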
++# Purpose of these subroutines is to explicitly encode VIS instructions,
++# so that one can compile the module without having to specify VIS
++# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# The idea is to reserve the option of producing a "universal" binary,
++# letting the programmer detect at run-time whether the current CPU is
++# VIS-capable.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %visopf = (        "faligndata"    => 0x048,
++              "bshuffle"      => 0x04c,
++              "fnot2"         => 0x066,
++              "fxor"          => 0x06c,
++              "fsrc2"         => 0x078        );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
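To make the .word template in unvis concrete: faligndata %f0,%f2,%f4
selects opf 0x048 with rd=4, rs1=0, rs2=2, so the formula yields
0x81b00000|4<<25|0<<14|0x048<<5|2 = 0x89b00902. A two-line check, as an
illustration only (not part of the patch):

    my ($rd, $rs1, $opf, $rs2) = (4, 0, 0x048, 2);  # faligndata %f0,%f2,%f4
    printf ".word\t0x%08x\n", 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2;
    # prints: .word  0x89b00902
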
++sub unvis3 {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my ($ref,$opf);
++my %visopf = (        "addxc"         => 0x011,
++              "addxccc"       => 0x013,
++              "umulxhi"       => 0x016,
++              "alignaddr"     => 0x018,
++              "bmask"         => 0x019,
++              "alignaddrl"    => 0x01a        );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%([goli])([0-9])/);
++          $_=$bias{$1}+$2;
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
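unvis3 differs from unvis only in operand handling: integer registers
are flattened through the %bias table (g=0, o=8, l=16, i=24) before the
same .word template is applied. For addxc %g2,%g3,%g2 that gives rs1=2,
rs2=3, rd=2 with opf 0x011, again as an illustration only (not part of
the patch):

    my %bias = ( g => 0, o => 8, l => 16, i => 24 );
    my ($rd, $rs1, $rs2) = ($bias{g}+2, $bias{g}+2, $bias{g}+3);  # addxc %g2,%g3,%g2
    printf ".word\t0x%08x\n", 0x81b00000|$rd<<25|$rs1<<14|0x011<<5|$rs2;
    # prints: .word  0x85b08223
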
++sub unaes_round {     # 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = (        "aes_eround01"  => 0,
++              "aes_eround23"  => 1,
++              "aes_dround01"  => 2,
++              "aes_dround23"  => 3,
++              "aes_eround01_l"=> 4,
++              "aes_eround23_l"=> 5,
++              "aes_dround01_l"=> 6,
++              "aes_dround23_l"=> 7,
++              "aes_kexpand1"  => 8    );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++    if (defined($opf=$aesopf{$mnemonic})) {
++      $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
++sub unaes_kexpand {   # 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = (        "aes_kexpand0"  => 0x130,
++              "aes_kexpand2"  => 0x131        );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if (defined($opf=$aesopf{$mnemonic})) {
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
++sub uncamellia_f {    # 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++    if (1) {
++      $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
++sub uncamellia3 {     # 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %cmllopf = (       "camellia_fl"   => 0x13c,
++              "camellia_fli"  => 0x13d        );
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if (defined($opf=$cmllopf{$mnemonic})) {
++      foreach ($rs1,$rs2,$rd) {
++          return $ref if (!/%f([0-9]{1,2})/);
++          $_=$1;
++          if ($1>=32) {
++              return $ref if ($1&1);
++              # re-encode for upper double register addressing
++              $_=($1|$1>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
++sub unmovxtox {               # 2-argument instructions
++my ($mnemonic,$rs,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
++my ($ref,$opf);
++my %movxopf = (       "movdtox"       => 0x110,
++              "movstouw"      => 0x111,
++              "movstosw"      => 0x113,
++              "movxtod"       => 0x118,
++              "movwtos"       => 0x119        );
++
++    $ref = "$mnemonic\t$rs,$rd";
++
++    if (defined($opf=$movxopf{$mnemonic})) {
++      foreach ($rs,$rd) {
++          return $ref if (!/%([fgoli])([0-9]{1,2})/);
++          $_=$bias{$1}+$2;
++          if ($2>=32) {
++              return $ref if ($2&1);
++              # re-encode for upper double register addressing
++              $_=($2|$2>>5)&31;
++          }
++      }
++
++      return  sprintf ".word\t0x%08x !%s",
++                      2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
++                      $ref;
++    } else {
++      return $ref;
++    }
++}
++
++sub undes {
++my ($mnemonic)=shift;
++my @args=@_;
++my ($ref,$opf);
++my %desopf = (        "des_round"     => 0b1001,
++              "des_ip"        => 0b100110100,
++              "des_iip"       => 0b100110101,
++              "des_kexpand"   => 0b100110110  );
++
++    $ref = "$mnemonic\t".join(",",@_);
++
++    if (defined($opf=$desopf{$mnemonic})) {   # 4-arg
++      if ($mnemonic eq "des_round") {
++          foreach (@args[0..3]) {
++              return $ref if (!/%f([0-9]{1,2})/);
++              $_=$1;
++              if ($1>=32) {
++                  return $ref if ($1&1);
++                  # re-encode for upper double register addressing
++                  $_=($1|$1>>5)&31;
++              }
++          }
++          return  sprintf ".word\t0x%08x !%s",
++                          2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
++                          $ref;
++      } elsif ($mnemonic eq "des_kexpand") {  # 3-arg
++          foreach (@args[0..2]) {
++              return $ref if (!/(%f)?([0-9]{1,2})/);
++              $_=$2;
++              if ($2>=32) {
++                  return $ref if ($2&1);
++                  # re-encode for upper double register addressing
++                  $_=($2|$2>>5)&31;
++              }
++          }
++          return  sprintf ".word\t0x%08x !%s",
++                          2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
++                          $ref;
++      } else {                                # 2-arg
++          foreach (@args[0..1]) {
++              return $ref if (!/%f([0-9]{1,2})/);
++              $_=$1;
++              if ($1>=32) {
++                  return $ref if ($1&1);
++                  # re-encode for upper double register addressing
++                  $_=($1|$1>>5)&31;
++              }
++          }
++          return  sprintf ".word\t0x%08x !%s",
++                          2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
++                          $ref;
++      }
++    } else {
++      return $ref;
++    }
++}
++
++sub emit_assembler {
++    foreach (split("\n",$::code)) {
++      s/\`([^\`]*)\`/eval $1/ge;
++
++      s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
++
++      s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++              &unaes_round($1,$2,$3,$4,$5)
++       /geo or
++      s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++              &unaes_kexpand($1,$2,$3,$4)
++       /geo or
++      s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++              &uncamellia_f($1,$2,$3,$4,$5)
++       /geo or
++      s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++              &uncamellia3($1,$2,$3,$4)
++       /geo or
++      s/\b(des_\w+)\s+(?<rs1>%f[0-9]{1,2}),\s*(?<rs2>[%fx0-9]+)(,\s*(?<rs3>%f[0-9]{1,2})(,\s*(?<rs4>%f[0-9]{1,2}))?)?/
++              &undes($1,$+{rs1},$+{rs2},$+{rs3},$+{rs4})
++       /geo or
++      s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
++              &unmovxtox($1,$2,$3)
++       /geo or
++      s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
++              &unmovxtox($1,$2,$3)
++       /geo or
++      s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++              &unvis($1,$2,$3,$4)
++       /geo or
++      s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++              &unvis3($1,$2,$3,$4)
++       /geo;
++
++      print $_,"\n";
++    }
++}
++
++1;
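
The trailing 1; makes the file a require-able Perl library: a cipher
script generates its assembly into $::code and then calls
emit_assembler(), which prints each line after rewriting T4/VIS
mnemonics into raw .word directives via the substitutions above. A
minimal driver sketch (an illustration, not part of the patch; it
assumes the module is saved under its upstream name, sparcv9_modes.pl,
and that its directory is on @INC):

    #!/usr/bin/env perl
    # One T4 instruction; emit_assembler() rewrites it to a .word.
    $::code = "\taddxc\t%g2, %g3, %g2\n";
    require "sparcv9_modes.pl";
    &emit_assembler();  # prints: .word 0x85b08223 !addxc ...
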
+Index: crypto/bn/asm/vis3-mont.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl
+--- openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,373 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <[email protected]> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++
++# October 2012.
++#
++# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
++# onward. There are three new instructions used here: umulxhi,
++# addxc[cc] and initializing store. On T3 RSA private key operations
++# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
++# lengths. This is without dedicated squaring procedure. On T4
++# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
++# for reference purposes, because T4 has dedicated Montgomery
++# multiplication and squaring *instructions* that deliver even more.
++
++$bits=32;
++for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64)  { $bias=2047; $frame=192; }
++else            { $bias=0;    $frame=112; }
++
++$code.=<<___ if ($bits==64);
++.register     %g2,#scratch
++.register     %g3,#scratch
++___
++$code.=<<___;
++.section      ".text",#alloc,#execinstr
++___
++
++($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
++      (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
++
++# int bn_mul_mont(
++$rp="%o0";    # BN_ULONG *rp,
++$ap="%o1";    # const BN_ULONG *ap,
++$bp="%o2";    # const BN_ULONG *bp,
++$np="%o3";    # const BN_ULONG *np,
++$n0p="%o4";   # const BN_ULONG *n0,
++$num="%o5";   # int num);     # caller ensures that num is even
++                              # and >=6
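
The loop code below leans on the T3+ integer multiply pair: mulx yields
the low 64 bits of a 64x64-bit product and umulxhi the high 64 bits, so
$lo0/$hi0 together hold ap[0]*bp[0]. A throwaway Perl model of the
pair, as an illustration only (not part of the patch):

    use Math::BigInt;
    my $m64 = Math::BigInt->new("0xffffffffffffffff");
    sub mulx    { Math::BigInt->new($_[0])->bmul($_[1])->band($m64) }  # low 64 bits
    sub umulxhi { Math::BigInt->new($_[0])->bmul($_[1])->brsft(64) }   # high 64 bits
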
++$code.=<<___;
++.globl        bn_mul_mont_vis3
++.align        32
++bn_mul_mont_vis3:
++      add     %sp,    $bias,  %g4     ! real top of stack
++      sll     $num,   2,      $num    ! size in bytes
++      add     $num,   63,     %g5
++      andn    %g5,    63,     %g5     ! buffer size rounded up to 64 bytes
++      add     %g5,    %g5,    %g1
++      add     %g5,    %g1,    %g1     ! 3*buffer size
++      sub     %g4,    %g1,    %g1
++      andn    %g1,    63,     %g1     ! align at 64 byte
++      sub     %g1,    $frame, %g1     ! new top of stack
++      sub     %g1,    %g4,    %g1
++
++      save    %sp,    %g1,    %sp
++___
++
++#     +-------------------------------+<----- %sp
++#     .                               .
++#     +-------------------------------+<----- aligned at 64 bytes
++#     | __int64 tmp[0]                |
++#     +-------------------------------+
++#     .                               .
++#     .                               .
++#     +-------------------------------+<----- aligned at 64 bytes
++#     | __int64 ap[1..0]              |       converted ap[]
++#     +-------------------------------+
++#     | __int64 np[1..0]              |       converted np[]
++#     +-------------------------------+
++#     | __int64 ap[3..2]              |
++#     .                               .
++#     .                               .
++#     +-------------------------------+
++($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
++($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
++($ovf,$i)=($t0,$t1);
++$code.=<<___;
++      ld      [$n0p+0],       $t0     ! pull n0[0..1] value
++      add     %sp, $bias+$frame, $tp
++      ld      [$n0p+4],       $t1
++      add     $tp,    %g5,    $anp
++      ld      [$bp+0],        $t2     ! m0=bp[0]
++      sllx    $t1,    32,     $n0
++      ld      [$bp+4],        $t3
++      or      $t0,    $n0,    $n0
++      add     $bp,    8,      $bp
++
++      ld      [$ap+0],        $t0     ! ap[0]
++      sllx    $t3,    32,     $m0
++      ld      [$ap+4],        $t1
++      or      $t2,    $m0,    $m0
++
++      ld      [$ap+8],        $t2     ! ap[1]
++      sllx    $t1,    32,     $aj
++      ld      [$ap+12],       $t3
++      or      $t0,    $aj,    $aj
++      add     $ap,    16,     $ap
++      stxa    $aj,    [$anp]0xe2      ! converted ap[0]
++
++      mulx    $aj,    $m0,    $lo0    ! ap[0]*bp[0]
++      umulxhi $aj,    $m0,    $hi0
++
++      ld      [$np+0],        $t0     ! np[0]
++      sllx    $t3,    32,     $aj
++      ld      [$np+4],        $t1
++      or      $t2,    $aj,    $aj
++
++      ld      [$np+8],        $t2     ! np[1]
++      sllx    $t1,    32,     $nj
++      ld      [$np+12],       $t3
++      or      $t0, $nj,       $nj
++      add     $np,    16,     $np
++      stx     $nj,    [$anp+8]        ! converted np[0]
++
++      mulx    $lo0,   $n0,    $m1     ! "tp[0]"*n0
++      stx     $aj,    [$anp+16]       ! converted ap[1]
++
++      mulx    $aj,    $m0,    $alo    ! ap[1]*bp[0]
++      umulxhi $aj,    $m0,    $aj     ! ahi=aj
++
++      mulx    $nj,    $m1,    $lo1    ! np[0]*m1
++      umulxhi $nj,    $m1,    $hi1
++
++      sllx    $t3,    32,     $nj
++      or      $t2,    $nj,    $nj
++      stx     $nj,    [$anp+24]       ! converted np[1]
++      add     $anp,   32,     $anp
++
++      addcc   $lo0,   $lo1,   $lo1
++      addxc   %g0,    $hi1,   $hi1
++
++      mulx    $nj,    $m1,    $nlo    ! np[1]*m1
++      umulxhi $nj,    $m1,    $nj     ! nhi=nj
++
++      ba      .L1st
++      sub     $num,   24,     $cnt    ! cnt=num-3
++
++.align        16
++.L1st:
++      ld      [$ap+0],        $t0     ! ap[j]
++      addcc   $alo,   $hi0,   $lo0
++      ld      [$ap+4],        $t1
++      addxc   $aj,    %g0,    $hi0
++
++      sllx    $t1,    32,     $aj
++      add     $ap,    8,      $ap
++      or      $t0,    $aj,    $aj
++      stxa    $aj,    [$anp]0xe2      ! converted ap[j]
++
++      ld      [$np+0],        $t2     ! np[j]
++      addcc   $nlo,   $hi1,   $lo1
++      ld      [$np+4],        $t3
++      addxc   $nj,    %g0,    $hi1    ! nhi=nj
++
++      sllx    $t3,    32,     $nj
++      add     $np,    8,      $np
++      mulx    $aj,    $m0,    $alo    ! ap[j]*bp[0]
++      or      $t2,    $nj,    $nj
++      umulxhi $aj,    $m0,    $aj     ! ahi=aj
++      stx     $nj,    [$anp+8]        ! converted np[j]
++      add     $anp,   16,     $anp    ! anp++
++

@@ Diff output truncated at 100000 characters. @@
This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.

Reply via email to