Revision: 25050
          http://sourceforge.net/p/gar/code/25050
Author:   janholzh
Date:     2015-06-02 07:41:26 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: update the t4-engine and wanboot patches

Modified Paths:
--------------
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch

Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 06:06:13 UTC (rev 25049)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:41:26 UTC (rev 25050)
@@ -13,9 +13,9 @@
  my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
 -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $fips_sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
  my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
  my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
- my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
 Index: crypto/sparccpuid.S
 ===================================================================
 diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
@@ -29,20 +29,7 @@
  #if defined(__SUNPRO_C) && defined(__sparcv9)
  # define ABI64  /* They've said -xarch=v9 at command line */
  #elif defined(__GNUC__) && defined(__arch64__)
-@@ -235,10 +239,10 @@
- .global       _sparcv9_vis1_probe
- .align        8
- _sparcv9_vis1_probe:
-+      .word   0x81b00d80      !fxor   %f0,%f0,%f0
-       add     %sp,BIAS+2,%o1
--      .word   0xc19a5a40      !ldda   [%o1]ASI_FP16_P,%f0
-       retl
--      .word   0x81b00d80      !fxor   %f0,%f0,%f0
-+      .word   0xc19a5a40      !ldda   [%o1]ASI_FP16_P,%f0
- .type _sparcv9_vis1_probe,#function
- .size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
- 
-@@ -251,7 +255,12 @@
+@@ -241,7 +245,12 @@
  !     UltraSPARC IIe          7
  !     UltraSPARC III          7
  !     UltraSPARC T1           24
@@ -55,7 +42,7 @@
  ! Numbers for T2 and SPARC64 V-VII are more than welcomed.
  !
  ! It would be possible to detect specifically US-T1 by instrumenting
-@@ -260,6 +269,8 @@
+@@ -250,6 +259,8 @@
  .global       _sparcv9_vis1_instrument
  .align        8
  _sparcv9_vis1_instrument:
@@ -64,9 +51,9 @@
        .word   0x91410000      !rd     %tick,%o0
        .word   0x81b00d80      !fxor   %f0,%f0,%f0
        .word   0x85b08d82      !fxor   %f2,%f2,%f2
-@@ -314,6 +325,30 @@
- .type _sparcv9_fmadd_probe,#function
- .size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+@@ -286,6 +297,30 @@
+ .type _sparcv9_vis1_instrument,#function
+ .size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
  
 +.global       _sparcv9_rdcfr
 +.align        8
@@ -95,7 +82,7 @@
  .global       OPENSSL_cleanse
  .align        32
  OPENSSL_cleanse:
-@@ -398,6 +433,102 @@
+@@ -370,6 +405,102 @@
  .size OPENSSL_cleanse,.-OPENSSL_cleanse
  
  #ifndef _BOOT
@@ -203,18 +190,20 @@
 diff -ru openssl-1.0.1e/crypto/sparcv9cap.c openssl-1.0.1e/crypto/sparcv9cap.c
 --- openssl-1.0.1e/crypto/sparcv9cap.c 2011-05-24 17:02:24.000000000 -0700
 +++ openssl-1.0.1e/crypto/sparcv9cap.c 2011-07-27 10:48:17.817470000 -0700
-@@ -4,34 +4,58 @@
+@@ -3,36 +3,59 @@
+ #include <string.h>
  #include <setjmp.h>
- #include <signal.h>
  #include <sys/time.h>
 +#include <unistd.h>
  #include <openssl/bn.h>
+ #include <sys/auxv.h>
  
 -#define SPARCV9_TICK_PRIVILEGED (1<<0)
 -#define SPARCV9_PREFER_FPU      (1<<1)
 -#define SPARCV9_VIS1            (1<<2)
 -#define SPARCV9_VIS2            (1<<3) /* reserved */
 -#define SPARCV9_FMADD           (1<<4) /* reserved for SPARC64 V */
+-#define SPARCV9_BLK             (1<<5)
 +#include "sparc_arch.h"
  
 +#if defined(__GNUC__) && defined(__linux)
@@ -275,13 +264,11 @@
  }
  
  unsigned long _sparcv9_rdtick(void);
-@@ -39,11 +63,18 @@
+@@ -37,11 +60,16 @@
+ 
+ unsigned long _sparcv9_rdtick(void);
  unsigned long _sparcv9_vis1_instrument(void);
- void _sparcv9_vis2_probe(void);
- void _sparcv9_fmadd_probe(void);
 +unsigned long _sparcv9_rdcfr(void);
-+void _sparcv9_vis3_probe(void);
-+unsigned long _sparcv9_random(void);
 +#ifndef _BOOT
 +size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t);
 +size_t _sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);
@@ -295,7 +282,7 @@
  #if defined(__sun) && defined(__SVR4)
          return gethrtime();
  #else
-@@ -52,6 +83,24 @@
+@@ -50,6 +80,24 @@
      else
          return _sparcv9_rdtick();
  }
@@ -320,7 +307,7 @@
  #endif
 
  #if defined(_BOOT)
-@@ -61,7 +110,7 @@
+@@ -59,7 +107,7 @@
   */
  void OPENSSL_cpuid_setup(void)
         {
@@ -329,7 +316,7 @@
         }
  
  #elif 0 && defined(__sun) && defined(__SVR4)
-@@ -90,11 +139,11 @@
+@@ -88,11 +136,11 @@
      if (!strcmp(name, "SUNW,UltraSPARC") ||
          /* covers II,III,IV */
          !strncmp(name, "SUNW,UltraSPARC-I", 17)) {
@@ -343,7 +330,7 @@
  
          return DI_WALK_TERMINATE;
      }
-@@ -100,7 +149,7 @@
+@@ -98,7 +146,7 @@
      }
      /* This is expected to catch remaining UltraSPARCs, such as T1 */
      else if (!strncmp(name, "SUNW,UltraSPARC", 15)) {
@@ -352,7 +339,7 @@
  
          return DI_WALK_TERMINATE;
      }
-@@ -119,7 +168,7 @@
+@@ -117,7 +165,7 @@
      trigger = 1;
  
      if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -361,7 +348,7 @@
          return;
      }
  
-@@ -126,15 +175,15 @@
+@@ -124,15 +172,15 @@
      if (sysinfo(SI_MACHINE, si, sizeof(si)) > 0) {
          if (strcmp(si, "sun4v"))
              /* FPU is preferred for all CPUs, but US-T1/2 */
@@ -381,7 +368,7 @@
              return;
          }
      }
-@@ -204,12 +253,14 @@
+@@ -195,7 +241,9 @@
      trigger = 1;
  
      if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -392,73 +379,48 @@
          return;
      }
  
+@@ -202,21 +250,48 @@
+     (void) getisax(&ui, 1);
+ 
      /* Initial value, fits UltraSPARC-I&II... */
--    OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
-+    OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
+-    OPENSSL_sparcv9cap_P = SPARCV9_BLK;
++    OPENSSL_sparcv9cap_P[0] = SPARCV9_BLK;
  
-     sigfillset(&all_masked);
-     sigdelset(&all_masked, SIGILL);
-@@ -232,18 +283,18 @@
- 
-     if (sigsetjmp(common_jmp, 1) == 0) {
-         _sparcv9_rdtick();
--        OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
-+        OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+     if (ui & AV_SPARC_VIS) {
+-        /* detect UltraSPARC-Tx, see sparccpuid.S for details... */
++        /* detect UltraSPARC-Tx, see sparccpud.S for details... */
+         if (_sparcv9_vis1_instrument() < 7)
+-            OPENSSL_sparcv9cap_P |= SPARCV9_TICK_PRIVILEGED;
++            OPENSSL_sparcv9cap_P[0] |= SPARCV9_TICK_PRIVILEGED;
+         if (_sparcv9_vis1_instrument() < 12) {
+-            OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_PREFER_FPU;
++            OPENSSL_sparcv9cap_P[0] |= (SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
+             if (ui & AV_SPARC_VIS2)
+-                OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+-        }
++                OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++         }
      }
  
-     if (sigsetjmp(common_jmp, 1) == 0) {
-         _sparcv9_vis1_probe();
--        OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
-+        OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1 | SPARCV9_BLK;
-         /* detect UltraSPARC-Tx, see sparccpud.S for details... */
-         if (_sparcv9_vis1_instrument() >= 12)
--            OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
-+            OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
-         else {
-             _sparcv9_vis2_probe();
--            OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
-+            OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
-         }
-     }
- 
-@@ -249,13 +300,50 @@
- 
-     if (sigsetjmp(common_jmp, 1) == 0) {
-         _sparcv9_fmadd_probe();
+     if (ui & AV_SPARC_FMAF)
 -        OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
 +        OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
-     }
- 
++
 +    /*
 +     * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
 +     * because VIS3 defines even integer instructions.
 +     */
-+    if (sigsetjmp(common_jmp,1) == 0) {
-+        _sparcv9_vis3_probe();
-+        OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
-+    }
++    if (ui & AV_SPARC_VIS3)
++            OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
 +
-+    if (sigsetjmp(common_jmp,1) == 0) {
-+        (void)_sparcv9_random();
-+        OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
-+    }
++#define AV_T4_MECHS     (AV_SPARC_AES | AV_SPARC_DES | AV_SPARC_KASUMI | \
++                         AV_SPARC_CAMELLIA | AV_SPARC_MD5 | AV_SPARC_SHA1 | \
++                         AV_SPARC_SHA256 | AV_SPARC_SHA512 | AV_SPARC_MPMUL | \
++                         AV_SPARC_CRC32C)
 +
-+    /*
-+     * In wait for better solution _sparcv9_rdcfr is masked by
-+     * VIS3 flag, because it goes to uninterruptable endless
-+     * loop on UltraSPARC II running Solaris. Things might be
-+     * different on Linux...
-+     */
-+    if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
-+        sigsetjmp(common_jmp, 1) == 0) {
++    if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) && (ui & AV_T4_MECHS))
 +        OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
-+    }
 +
-     sigaction(SIGBUS, &bus_oact, NULL);
-     sigaction(SIGILL, &ill_oact, NULL);
- 
-     sigprocmask(SIG_SETMASK, &oset, NULL);
-+
 +    if (sizeof(size_t) == 8)
 +        OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
 +#ifdef __linux
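Note on the hunk above: the patch drops the old SIGILL/sigsetjmp probe loop in favor of Solaris getisax(2), where the kernel reports hardware capabilities as AV_SPARC_* bits and no instruction has to be trial-executed. A minimal standalone sketch of that probe, assuming a SPARC Solaris build with <sys/auxv.h> (the printf labels are illustrative only):

#include <stdio.h>
#include <sys/types.h>
#include <sys/auxv.h>           /* getisax(), AV_SPARC_* flags */

int main(void)
{
    uint32_t ui = 0;

    (void) getisax(&ui, 1);     /* fill one word of capability bits */

    if (ui & AV_SPARC_VIS)
        printf("VIS1 present\n");
    if (ui & AV_SPARC_VIS3)
        printf("VIS3 present\n");
    if (ui & AV_SPARC_AES)
        printf("T4 AES opcodes present\n");
    return 0;
}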
@@ -2265,5563 +2227,3 @@
      {ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
      {ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
      {ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
-Index: crypto/sparc_arch.h
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
---- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/sparc_arch.h 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,101 @@
-+#ifndef __SPARC_ARCH_H__
-+#define       __SPARC_ARCH_H__
-+
-+#define       SPARCV9_TICK_PRIVILEGED (1<<0)
-+#define       SPARCV9_PREFER_FPU      (1<<1)
-+#define       SPARCV9_VIS1            (1<<2)
-+#define       SPARCV9_VIS2            (1<<3)  /* reserved */
-+#define       SPARCV9_FMADD           (1<<4)  /* reserved for SPARC64 V */
-+#define       SPARCV9_BLK             (1<<5)  /* VIS1 block copy */
-+#define       SPARCV9_VIS3            (1<<6)
-+#define       SPARCV9_RANDOM          (1<<7)
-+#define       SPARCV9_64BIT_STACK     (1<<8)
-+
-+/*
-+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
-+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
-+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
-+ */
-+#define       CFR_AES         0x00000001 /* Supports AES opcodes      */
-+#define       CFR_DES         0x00000002 /* Supports DES opcodes      */
-+#define       CFR_KASUMI      0x00000004 /* Supports KASUMI opcodes   */
-+#define       CFR_CAMELLIA    0x00000008 /* Supports CAMELLIA opcodes */
-+#define       CFR_MD5         0x00000010 /* Supports MD5 opcodes      */
-+#define       CFR_SHA1        0x00000020 /* Supports SHA1 opcodes     */
-+#define       CFR_SHA256      0x00000040 /* Supports SHA256 opcodes   */
-+#define       CFR_SHA512      0x00000080 /* Supports SHA512 opcodes   */
-+#define       CFR_MPMUL       0x00000100 /* Supports MPMUL opcodes    */
-+#define       CFR_MONTMUL     0x00000200 /* Supports MONTMUL opcodes  */
-+#define       CFR_MONTSQR     0x00000400 /* Supports MONTSQR opcodes  */
-+#define       CFR_CRC32C      0x00000800 /* Supports CRC32C opcodes   */
-+
-+#if defined(OPENSSL_PIC) && !defined(__PIC__)
-+#define       __PIC__
-+#endif
-+
-+#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
-+#define       __arch64__
-+#endif
-+
-+#define       SPARC_PIC_THUNK(reg)    \
-+      .align  32;             \
-+.Lpic_thunk:                  \
-+      jmp     %o7 + 8;        \
-+      add     %o7, reg, reg;
-+
-+#define       SPARC_PIC_THUNK_CALL(reg)                       \
-+      sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
-+      call    .Lpic_thunk;                            \
-+      or      reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
-+
-+#if 1
-+#define       SPARC_SETUP_GOT_REG(reg)        SPARC_PIC_THUNK_CALL(reg)
-+#else
-+#define       SPARC_SETUP_GOT_REG(reg)        \
-+      sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
-+      call    .+8;                                    \
-+      or      reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
-+      add     %o7, reg, reg
-+#endif
-+
-+#if defined(__arch64__)
-+
-+#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
-+      setx    SYM, %o7, reg;
-+#define       LDPTR           ldx
-+#define       SIZE_T_CC       %xcc
-+#define       STACK_FRAME     192
-+#define       STACK_BIAS      2047
-+#define       STACK_7thARG    (STACK_BIAS+176)
-+
-+#else
-+
-+#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
-+      set     SYM, reg;
-+#define       LDPTR           ld
-+#define       SIZE_T_CC       %icc
-+#define       STACK_FRAME     112
-+#define       STACK_BIAS      0
-+#define       STACK_7thARG    92
-+#define       SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
-+
-+#endif
-+
-+#ifdef __PIC__
-+#undef        SPARC_LOAD_ADDRESS
-+#undef SPARC_LOAD_ADDRESS_LEAF
-+#define       SPARC_LOAD_ADDRESS(SYM, reg)    \
-+      SPARC_SETUP_GOT_REG(reg);       \
-+      sethi   %hi(SYM), %o7;          \
-+      or      %o7, %lo(SYM), %o7;     \
-+      LDPTR   [reg + %o7], reg;
-+#endif
-+
-+#ifndef SPARC_LOAD_ADDRESS_LEAF
-+#define       SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp)  \
-+      mov     %o7, tmp;                       \
-+      SPARC_LOAD_ADDRESS(SYM, reg)            \
-+      mov     tmp, %o7;
-+#endif
-+
-+#endif        /* __SPARC_ARCH_H__ */
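For orientation: per the sparc_arch.h comment above, OPENSSL_sparcv9cap_P[1] holds a copy of the Compatibility Feature Register (%asr26) on SPARC T4 and later, and each hardware routine gates on its CFR_* bit (md5_block_asm_data_order below tests CFR_MD5). A hedged C equivalent of that gate, assuming the two-element OPENSSL_sparcv9cap_P this patch introduces (the helper name is illustrative):

#include "sparc_arch.h"         /* CFR_* bit definitions shown above */

/* Filled in by OPENSSL_cpuid_setup(); [1] is the %cfr copy on T4+,
 * zero on older CPUs (array size matches the [0]/[1] usage in the patch). */
extern unsigned int OPENSSL_sparcv9cap_P[2];

/* C rendering of the CFR_MD5 check at the top of
 * md5_block_asm_data_order in the module below. */
static int t4_md5_available(void)
{
    return (OPENSSL_sparcv9cap_P[1] & CFR_MD5) != 0;
}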
-Index: crypto/md5/asm/md5-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
---- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,434 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <[email protected]> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+#
-+# Hardware SPARC T4 support by David S. Miller <[email protected]>.
-+# ====================================================================
-+
-+# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
-+# code generated by Sun C 5.2.
-+
-+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
-+# faster than software. Multi-process benchmark saturates at 12x
-+# single-process result on 8-core processor, or ~11GBps per 2.85GHz
-+# socket.
-+
-+$bits=32;
-+for (@ARGV)   { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+if ($bits==64)        { $bias=2047; $frame=192; }
-+else          { $bias=0;    $frame=112; }
-+
-+$output=shift;
-+open STDOUT,">$output";
-+
-+use integer;
-+
-+($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
-+
-+# 64-bit values
-+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
-+$tx="%g3";
-+($AB,$CD)=("%g4","%g5");
-+
-+# 32-bit values
-+@V=($A,$B,$C,$D)=map("%l$_",(0..3));
-+($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
-+($shr,$shl1,$shl2)=("%i3","%i4","%i5");
-+
-+my @K=(       0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
-+      0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
-+      0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
-+      0x6b901122,0xfd987193,0xa679438e,0x49b40821,
-+
-+      0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
-+      0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
-+      0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
-+      0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
-+
-+      0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
-+      0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
-+      0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
-+      0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
-+
-+      0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
-+      0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
-+      0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
-+      0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0  );
-+
-+sub R0 {
-+  my ($i,$a,$b,$c,$d) = @_;
-+  my $rot = (7,12,17,22)[$i%4];
-+  my $j   = ($i+1)/2;
-+
-+  if ($i&1) {
-+    $code.=<<___;
-+       srlx   @X[$j],$shr,@X[$j]      ! align X[`$i+1`]
-+      and     $b,$t1,$t1              ! round $i
-+       sllx   @X[$j+1],$shl1,$tx
-+      add     $t2,$a,$a
-+       sllx   $tx,$shl2,$tx
-+      xor     $d,$t1,$t1
-+       or     $tx,@X[$j],@X[$j]
-+       sethi  %hi(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      sll     $a,$rot,$t3
-+       add    @X[$j],$t2,$t2          ! X[`$i+1`]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+       xor     $b,$c,$t1
-+      add     $t3,$a,$a
-+___
-+  } else {
-+    $code.=<<___;
-+       srlx   @X[$j],32,$tx           ! extract X[`2*$j+1`]
-+      and     $b,$t1,$t1              ! round $i
-+      add     $t2,$a,$a
-+      xor     $d,$t1,$t1
-+       sethi  %hi(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      sll     $a,$rot,$t3
-+       add    $tx,$t2,$t2             ! X[`2*$j+1`]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+       xor     $b,$c,$t1
-+      add     $t3,$a,$a
-+___
-+  }
-+}
-+
-+sub R0_1 {
-+  my ($i,$a,$b,$c,$d) = @_;
-+  my $rot = (7,12,17,22)[$i%4];
-+
-+$code.=<<___;
-+       srlx   @X[0],32,$tx            ! extract X[1]
-+      and     $b,$t1,$t1              ! round $i
-+      add     $t2,$a,$a
-+      xor     $d,$t1,$t1
-+       sethi  %hi(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      sll     $a,$rot,$t3
-+       add    $tx,$t2,$t2             ! X[1]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+       andn    $b,$c,$t1
-+      add     $t3,$a,$a
-+___
-+}
-+
-+sub R1 {
-+  my ($i,$a,$b,$c,$d) = @_;
-+  my $rot = (5,9,14,20)[$i%4];
-+  my $j   = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
-+  my $xi  = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+       srlx   @X[$j/2],32,$xi         ! extract X[$j]
-+___
-+$code.=<<___;
-+      and     $b,$d,$t3               ! round $i
-+      add     $t2,$a,$a
-+      or      $t3,$t1,$t1
-+       sethi  %hi(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      sll     $a,$rot,$t3
-+       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+       `$i<31?"andn":"xor"`    $b,$c,$t1
-+      add     $t3,$a,$a
-+___
-+}
-+
-+sub R2 {
-+  my ($i,$a,$b,$c,$d) = @_;
-+  my $rot = (4,11,16,23)[$i%4];
-+  my $j   = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
-+  my $xi  = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+       srlx   @X[$j/2],32,$xi         ! extract X[$j]
-+___
-+$code.=<<___;
-+      add     $t2,$a,$a               ! round $i
-+      xor     $b,$t1,$t1
-+       sethi  %hi(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      sll     $a,$rot,$t3
-+       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+       xor     $b,$c,$t1
-+      add     $t3,$a,$a
-+___
-+}
-+
-+sub R3 {
-+  my ($i,$a,$b,$c,$d) = @_;
-+  my $rot = (6,10,15,21)[$i%4];
-+  my $j   = (0+7*($i+1))%16;
-+  my $xi  = @X[$j/2];
-+
-+$code.=<<___;
-+      add     $t2,$a,$a               ! round $i
-+___
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+       srlx   @X[$j/2],32,$xi         ! extract X[$j]
-+___
-+$code.=<<___;
-+      orn     $b,$d,$t1
-+       sethi  %hi(@K[$i+1]),$t2
-+      xor     $c,$t1,$t1
-+       or     $t2,%lo(@K[$i+1]),$t2
-+      add     $t1,$a,$a
-+      sll     $a,$rot,$t3
-+       add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
-+      srl     $a,32-$rot,$a
-+      add     $b,$t3,$t3
-+      add     $t3,$a,$a
-+___
-+}
-+
-+$code.=<<___ if ($bits==64);
-+.register     %g2,#scratch
-+.register     %g3,#scratch
-+___
-+$code.=<<___;
-+#include "sparc_arch.h"
-+
-+.section      ".text",#alloc,#execinstr
-+
-+#ifdef __PIC__
-+SPARC_PIC_THUNK(%g1)
-+#endif
-+
-+.globl        md5_block_asm_data_order
-+.align        32
-+md5_block_asm_data_order:
-+      SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
-+      ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
-+
-+      andcc   %g1, CFR_MD5, %g0
-+      be      .Lsoftware
-+      nop
-+
-+      mov     4, %g1
-+      andcc   %o1, 0x7, %g0
-+      lda     [%o0 + %g0]0x88, %f0            ! load context
-+      lda     [%o0 + %g1]0x88, %f1
-+      add     %o0, 8, %o0
-+      lda     [%o0 + %g0]0x88, %f2
-+      lda     [%o0 + %g1]0x88, %f3
-+      bne,pn  %icc, .Lhwunaligned
-+      sub     %o0, 8, %o0
-+
-+.Lhw_loop:
-+      ldd     [%o1 + 0x00], %f8
-+      ldd     [%o1 + 0x08], %f10
-+      ldd     [%o1 + 0x10], %f12
-+      ldd     [%o1 + 0x18], %f14
-+      ldd     [%o1 + 0x20], %f16
-+      ldd     [%o1 + 0x28], %f18
-+      ldd     [%o1 + 0x30], %f20
-+      subcc   %o2, 1, %o2             ! done yet? 
-+      ldd     [%o1 + 0x38], %f22
-+      add     %o1, 0x40, %o1
-+      prefetch [%o1 + 63], 20
-+
-+      .word   0x81b02800              ! MD5
-+
-+      bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhw_loop
-+      nop
-+
-+.Lhwfinish:
-+      sta     %f0, [%o0 + %g0]0x88    ! store context
-+      sta     %f1, [%o0 + %g1]0x88
-+      add     %o0, 8, %o0
-+      sta     %f2, [%o0 + %g0]0x88
-+      sta     %f3, [%o0 + %g1]0x88
-+      retl
-+      nop
-+
-+.align        8
-+.Lhwunaligned:
-+      alignaddr %o1, %g0, %o1
-+
-+      ldd     [%o1 + 0x00], %f10
-+.Lhwunaligned_loop:
-+      ldd     [%o1 + 0x08], %f12
-+      ldd     [%o1 + 0x10], %f14
-+      ldd     [%o1 + 0x18], %f16
-+      ldd     [%o1 + 0x20], %f18
-+      ldd     [%o1 + 0x28], %f20
-+      ldd     [%o1 + 0x30], %f22
-+      ldd     [%o1 + 0x38], %f24
-+      subcc   %o2, 1, %o2             ! done yet?
-+      ldd     [%o1 + 0x40], %f26
-+      add     %o1, 0x40, %o1
-+      prefetch [%o1 + 63], 20
-+
-+      faligndata %f10, %f12, %f8
-+      faligndata %f12, %f14, %f10
-+      faligndata %f14, %f16, %f12
-+      faligndata %f16, %f18, %f14
-+      faligndata %f18, %f20, %f16
-+      faligndata %f20, %f22, %f18
-+      faligndata %f22, %f24, %f20
-+      faligndata %f24, %f26, %f22
-+
-+      .word   0x81b02800              ! MD5
-+
-+      bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
-+      for     %f26, %f26, %f10        ! %f10=%f26
-+
-+      ba      .Lhwfinish
-+      nop
-+
-+.align        16
-+.Lsoftware:
-+      save    %sp,-$frame,%sp
-+
-+      rd      %asi,$saved_asi
-+      wr      %g0,0x88,%asi           ! ASI_PRIMARY_LITTLE
-+      and     $inp,7,$shr
-+      andn    $inp,7,$inp
-+
-+      sll     $shr,3,$shr             ! *=8
-+      mov     56,$shl2
-+      ld      [$ctx+0],$A
-+      sub     $shl2,$shr,$shl2
-+      ld      [$ctx+4],$B
-+      and     $shl2,32,$shl1
-+      add     $shl2,8,$shl2
-+      ld      [$ctx+8],$C
-+      sub     $shl2,$shl1,$shl2       ! shr+shl1+shl2==64
-+      ld      [$ctx+12],$D
-+      nop
-+
-+.Loop:
-+       cmp    $shr,0                  ! was inp aligned?
-+      ldxa    [$inp+0]%asi,@X[0]      ! load little-endian input
-+      ldxa    [$inp+8]%asi,@X[1]
-+      ldxa    [$inp+16]%asi,@X[2]
-+      ldxa    [$inp+24]%asi,@X[3]
-+      ldxa    [$inp+32]%asi,@X[4]
-+       sllx   $A,32,$AB               ! pack A,B
-+      ldxa    [$inp+40]%asi,@X[5]
-+       sllx   $C,32,$CD               ! pack C,D
-+      ldxa    [$inp+48]%asi,@X[6]
-+       or     $B,$AB,$AB
-+      ldxa    [$inp+56]%asi,@X[7]
-+       or     $D,$CD,$CD
-+      bnz,a,pn        %icc,.+8
-+      ldxa    [$inp+64]%asi,@X[8]
-+
-+      srlx    @X[0],$shr,@X[0]        ! align X[0]
-+      sllx    @X[1],$shl1,$tx
-+       sethi  %hi(@K[0]),$t2
-+      sllx    $tx,$shl2,$tx
-+       or     $t2,%lo(@K[0]),$t2
-+      or      $tx,@X[0],@X[0]
-+       xor    $C,$D,$t1
-+       add    @X[0],$t2,$t2           ! X[0]+K[0]
-+___
-+      for ($i=0;$i<15;$i++)   { &R0($i,@V);   unshift(@V,pop(@V)); }
-+      for (;$i<16;$i++)       { &R0_1($i,@V); unshift(@V,pop(@V)); }
-+      for (;$i<32;$i++)       { &R1($i,@V);   unshift(@V,pop(@V)); }
-+      for (;$i<48;$i++)       { &R2($i,@V);   unshift(@V,pop(@V)); }
-+      for (;$i<64;$i++)       { &R3($i,@V);   unshift(@V,pop(@V)); }
-+$code.=<<___;
-+      srlx    $AB,32,$t1              ! unpack A,B,C,D and accumulate
-+      add     $inp,64,$inp            ! advance inp
-+      srlx    $CD,32,$t2
-+      add     $t1,$A,$A
-+      subcc   $len,1,$len             ! done yet?
-+      add     $AB,$B,$B
-+      add     $t2,$C,$C
-+      add     $CD,$D,$D
-+      srl     $B,0,$B                 ! clruw $B
-+      bne     `$bits==64?"%xcc":"%icc"`,.Loop
-+      srl     $D,0,$D                 ! clruw $D
-+
-+      st      $A,[$ctx+0]             ! write out ctx
-+      st      $B,[$ctx+4]
-+      st      $C,[$ctx+8]
-+      st      $D,[$ctx+12]
-+
-+      wr      %g0,$saved_asi,%asi
-+      ret
-+      restore
-+.type md5_block_asm_data_order,#function
-+.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
-+
-+.asciz        "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
-+.align        4
-+___
-+
-+# Purpose of these subroutines is to explicitly encode VIS instructions,
-+# so that one can compile the module without having to specify VIS
-+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
-+# Idea is to reserve for option to produce "universal" binary and let
-+# programmer detect if current CPU is VIS capable at run-time.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my $ref,$opf;
-+my %visopf = (        "faligndata"    => 0x048,
-+              "for"           => 0x07c        );
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+    if ($opf=$visopf{$mnemonic}) {
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%f([0-9]{1,2})/);
-+          $_=$1;
-+          if ($1>=32) {
-+              return $ref if ($1&1);
-+              # re-encode for upper double register addressing
-+              $_=($1|$1>>5)&31;
-+          }
-+      }
-+
-+      return  sprintf ".word\t0x%08x !%s",
-+                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+                      $ref;
-+    } else {
-+      return $ref;
-+    }
-+}
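The bit layout unvis() packs is the FPop template 0x81b00000 with rd at bit 25, rs1 at bit 14, the VIS opf at bit 5 and rs2 in the low bits, which is why the module assembles without -xarch=v9a. A small C sketch of the same packing (encode_vis and the sample operands are illustrative, not part of the module):

#include <stdint.h>

/* Same packing as unvis() above:
 * 0x81b00000 | rd<<25 | rs1<<14 | opf<<5 | rs2 */
static uint32_t encode_vis(unsigned rd, unsigned rs1,
                           unsigned opf, unsigned rs2)
{
    return 0x81b00000u | rd << 25 | rs1 << 14 | opf << 5 | rs2;
}

/* encode_vis(4, 0, 0x048, 2) == 0x89b00902, the ".word" for
 * "faligndata %f0, %f2, %f4" (opf 0x048 per the %visopf table). */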
-+sub unalignaddr {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my $ref="$mnemonic\t$rs1,$rs2,$rd";
-+
-+    foreach ($rs1,$rs2,$rd) {
-+      if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
-+      else                    { return $ref; }
-+    }
-+    return  sprintf ".word\t0x%08x !%s",
-+                  0x81b00300|$rd<<25|$rs1<<14|$rs2,
-+                  $ref;
-+}
-+
-+foreach (split("\n",$code)) {
-+      s/\`([^\`]*)\`/eval $1/ge;
-+
-+      s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
-+              &unvis($1,$2,$3,$4)
-+       /ge;
-+      s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
-+              &unalignaddr($1,$2,$3,$4)
-+       /ge;
-+
-+      print $_,"\n";
-+}
-+
-+close STDOUT;
-Index: crypto/aes/asm/aest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
---- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,902 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <[email protected]> and Andy Polyakov
-+# <[email protected]>. The module is licensed under 2-clause BSD
-+# license. October 2012. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# AES for SPARC T4.
-+#
-+# AES round instructions complete in 3 cycles and can be issued every
-+# cycle. It means that round calculations should take 4*rounds cycles,
-+# because any given round instruction depends on result of *both*
-+# previous instructions:
-+#
-+#     |0 |1 |2 |3 |4
-+#     |01|01|01|
-+#        |23|23|23|
-+#                 |01|01|...
-+#                    |23|...
-+#
-+# Provided that fxor [with IV] takes 3 cycles to complete, critical
-+# path length for CBC encrypt would be 3+4*rounds, or in other words
-+# it should process one byte in at least (3+4*rounds)/16 cycles. This
-+# estimate doesn't account for "collateral" instructions, such as
-+# fetching input from memory, xor-ing it with zero-round key and
-+# storing the result. Yet, *measured* performance [for data aligned
-+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
-+#
-+#             128-bit key     192-            256-
-+# CBC encrypt 2.70/2.90(*)    3.20/3.40       3.70/3.90
-+#                      (*) numbers after slash are for
-+#                          misaligned data.
-+#
-+# Out-of-order execution logic managed to fully overlap "collateral"
-+# instructions with those on critical path. Amazing!
-+#
-+# As with Intel AES-NI, question is if it's possible to improve
-+# performance of parallelizeable modes by interleaving round
-+# instructions. Provided round instruction latency and throughput
-+# optimal interleave factor is 2. But can we expect 2x performance
-+# improvement? Well, as round instructions can be issued one per
-+# cycle, they don't saturate the 2-way issue pipeline and therefore
-+# there is room for "collateral" calculations... Yet, 2x speed-up
-+# over CBC encrypt remains unattaintable:
-+#
-+#             128-bit key     192-            256-
-+# CBC decrypt 1.64/2.11       1.89/2.37       2.23/2.61
-+# CTR         1.64/2.08(*)    1.89/2.33       2.23/2.61
-+#                      (*) numbers after slash are for
-+#                          misaligned data.
-+#
-+# Estimates based on amount of instructions under assumption that
-+# round instructions are not pairable with any other instruction
-+# suggest that latter is the actual case and pipeline runs
-+# underutilized. It should be noted that T4 out-of-order execution
-+# logic is so capable that performance gain from 2x interleave is
-+# not even impressive, ~7-13% over non-interleaved code, largest
-+# for 256-bit keys.
-+
-+# To anchor to something else, software implementation processes
-+# one byte in 29 cycles with 128-bit key on same processor. Intel
-+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
-+# in 0.93, naturally with AES-NI.
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$::evp=1;     # if $evp is set to 0, script generates module with
-+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
-+# points. These however are not fully compatible with openssl/aes.h,
-+# because they expect AES_KEY to be aligned at 64-bit boundary. When
-+# used through EVP, alignment is arranged at EVP layer. Second thing
-+# that is arranged by EVP is at least 32-bit alignment of IV.
-+
-+######################################################################
-+# single-round subroutines
-+#
-+{
-+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
-+
-+$code=<<___;
-+.text
-+
-+.globl        aes_t4_encrypt
-+.align        32
-+aes_t4_encrypt:
-+      andcc           $inp, 7, %g1            ! is input aligned?
-+      andn            $inp, 7, $inp
-+
-+      ldx             [$key + 0], %g4
-+      ldx             [$key + 8], %g5
-+
-+      ldx             [$inp + 0], %o4
-+      bz,pt           %icc, 1f
-+      ldx             [$inp + 8], %o5
-+      ldx             [$inp + 16], $inp
-+      sll             %g1, 3, %g1
-+      sub             %g0, %g1, %o3
-+      sllx            %o4, %g1, %o4
-+      sllx            %o5, %g1, %g1
-+      srlx            %o5, %o3, %o5
-+      srlx            $inp, %o3, %o3
-+      or              %o5, %o4, %o4
-+      or              %o3, %g1, %o5
-+1:
-+      ld              [$key + 240], $rounds
-+      ldd             [$key + 16], %f12
-+      ldd             [$key + 24], %f14
-+      xor             %g4, %o4, %o4
-+      xor             %g5, %o5, %o5
-+      movxtod         %o4, %f0
-+      movxtod         %o5, %f2
-+      srl             $rounds, 1, $rounds
-+      ldd             [$key + 32], %f16
-+      sub             $rounds, 1, $rounds
-+      ldd             [$key + 40], %f18
-+      add             $key, 48, $key
-+
-+.Lenc:
-+      aes_eround01    %f12, %f0, %f2, %f4
-+      aes_eround23    %f14, %f0, %f2, %f2
-+      ldd             [$key + 0], %f12
-+      ldd             [$key + 8], %f14
-+      sub             $rounds,1,$rounds
-+      aes_eround01    %f16, %f4, %f2, %f0
-+      aes_eround23    %f18, %f4, %f2, %f2
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      brnz,pt         $rounds, .Lenc
-+      add             $key, 32, $key
-+
-+      andcc           $out, 7, $tmp           ! is output aligned?
-+      aes_eround01    %f12, %f0, %f2, %f4
-+      aes_eround23    %f14, %f0, %f2, %f2
-+      aes_eround01_l  %f16, %f4, %f2, %f0
-+      aes_eround23_l  %f18, %f4, %f2, %f2
-+
-+      bnz,pn          %icc, 2f
-+      nop
-+
-+      std             %f0, [$out + 0]
-+      retl
-+      std             %f2, [$out + 8]
-+
-+2:    alignaddrl      $out, %g0, $out
-+      mov             0xff, $mask
-+      srl             $mask, $tmp, $mask
-+
-+      faligndata      %f0, %f0, %f4
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+
-+      stda            %f4, [$out + $mask]0xc0 ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $mask, $mask
-+      retl
-+      stda            %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_encrypt,#function
-+.size aes_t4_encrypt,.-aes_t4_encrypt
-+
-+.globl        aes_t4_decrypt
-+.align        32
-+aes_t4_decrypt:
-+      andcc           $inp, 7, %g1            ! is input aligned?
-+      andn            $inp, 7, $inp
-+
-+      ldx             [$key + 0], %g4
-+      ldx             [$key + 8], %g5
-+
-+      ldx             [$inp + 0], %o4
-+      bz,pt           %icc, 1f
-+      ldx             [$inp + 8], %o5
-+      ldx             [$inp + 16], $inp
-+      sll             %g1, 3, %g1
-+      sub             %g0, %g1, %o3
-+      sllx            %o4, %g1, %o4
-+      sllx            %o5, %g1, %g1
-+      srlx            %o5, %o3, %o5
-+      srlx            $inp, %o3, %o3
-+      or              %o5, %o4, %o4
-+      or              %o3, %g1, %o5
-+1:
-+      ld              [$key + 240], $rounds
-+      ldd             [$key + 16], %f12
-+      ldd             [$key + 24], %f14
-+      xor             %g4, %o4, %o4
-+      xor             %g5, %o5, %o5
-+      movxtod         %o4, %f0
-+      movxtod         %o5, %f2
-+      srl             $rounds, 1, $rounds
-+      ldd             [$key + 32], %f16
-+      sub             $rounds, 1, $rounds
-+      ldd             [$key + 40], %f18
-+      add             $key, 48, $key
-+
-+.Ldec:
-+      aes_dround01    %f12, %f0, %f2, %f4
-+      aes_dround23    %f14, %f0, %f2, %f2
-+      ldd             [$key + 0], %f12
-+      ldd             [$key + 8], %f14
-+      sub             $rounds,1,$rounds
-+      aes_dround01    %f16, %f4, %f2, %f0
-+      aes_dround23    %f18, %f4, %f2, %f2
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      brnz,pt         $rounds, .Ldec
-+      add             $key, 32, $key
-+
-+      andcc           $out, 7, $tmp           ! is output aligned?
-+      aes_dround01    %f12, %f0, %f2, %f4
-+      aes_dround23    %f14, %f0, %f2, %f2
-+      aes_dround01_l  %f16, %f4, %f2, %f0
-+      aes_dround23_l  %f18, %f4, %f2, %f2
-+
-+      bnz,pn          %icc, 2f
-+      nop
-+
-+      std             %f0, [$out + 0]
-+      retl
-+      std             %f2, [$out + 8]
-+
-+2:    alignaddrl      $out, %g0, $out
-+      mov             0xff, $mask
-+      srl             $mask, $tmp, $mask
-+
-+      faligndata      %f0, %f0, %f4
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+
-+      stda            %f4, [$out + $mask]0xc0 ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $mask, $mask
-+      retl
-+      stda            %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_decrypt,#function
-+.size aes_t4_decrypt,.-aes_t4_decrypt
-+___
-+}
-+
-+######################################################################
-+# key setup subroutines
-+#
-+{
-+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
-+$code.=<<___;
-+.globl        aes_t4_set_encrypt_key
-+.align        32
-+aes_t4_set_encrypt_key:
-+.Lset_encrypt_key:
-+      and             $inp, 7, $tmp
-+      alignaddr       $inp, %g0, $inp
-+      cmp             $bits, 192
-+      ldd             [$inp + 0], %f0
-+      bl,pt           %icc,.L128
-+      ldd             [$inp + 8], %f2
-+
-+      be,pt           %icc,.L192
-+      ldd             [$inp + 16], %f4
-+      brz,pt          $tmp, .L256aligned
-+      ldd             [$inp + 24], %f6
-+
-+      ldd             [$inp + 32], %f8
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+      faligndata      %f4, %f6, %f4
-+      faligndata      %f6, %f8, %f6
-+.L256aligned:
-+___
-+for ($i=0; $i<6; $i++) {
-+    $code.=<<___;
-+      std             %f0, [$out + `32*$i+0`]
-+      aes_kexpand1    %f0, %f6, $i, %f0
-+      std             %f2, [$out + `32*$i+8`]
-+      aes_kexpand2    %f2, %f0, %f2
-+      std             %f4, [$out + `32*$i+16`]
-+      aes_kexpand0    %f4, %f2, %f4
-+      std             %f6, [$out + `32*$i+24`]
-+      aes_kexpand2    %f6, %f4, %f6
-+___
-+}
-+$code.=<<___;
-+      std             %f0, [$out + `32*$i+0`]
-+      aes_kexpand1    %f0, %f6, $i, %f0
-+      std             %f2, [$out + `32*$i+8`]
-+      aes_kexpand2    %f2, %f0, %f2
-+      std             %f4, [$out + `32*$i+16`]
-+      std             %f6, [$out + `32*$i+24`]
-+      std             %f0, [$out + `32*$i+32`]
-+      std             %f2, [$out + `32*$i+40`]
-+
-+      mov             14, $tmp
-+      st              $tmp, [$out + 240]
-+      retl
-+      xor             %o0, %o0, %o0
-+
-+.align        16
-+.L192:
-+      brz,pt          $tmp, .L192aligned
-+      nop
-+
-+      ldd             [$inp + 24], %f6
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+      faligndata      %f4, %f6, %f4
-+.L192aligned:
-+___
-+for ($i=0; $i<7; $i++) {
-+    $code.=<<___;
-+      std             %f0, [$out + `24*$i+0`]
-+      aes_kexpand1    %f0, %f4, $i, %f0
-+      std             %f2, [$out + `24*$i+8`]
-+      aes_kexpand2    %f2, %f0, %f2
-+      std             %f4, [$out + `24*$i+16`]
-+      aes_kexpand2    %f4, %f2, %f4
-+___
-+}
-+$code.=<<___;
-+      std             %f0, [$out + `24*$i+0`]
-+      aes_kexpand1    %f0, %f4, $i, %f0
-+      std             %f2, [$out + `24*$i+8`]
-+      aes_kexpand2    %f2, %f0, %f2
-+      std             %f4, [$out + `24*$i+16`]
-+      std             %f0, [$out + `24*$i+24`]
-+      std             %f2, [$out + `24*$i+32`]
-+
-+      mov             12, $tmp
-+      st              $tmp, [$out + 240]
-+      retl
-+      xor             %o0, %o0, %o0
-+
-+.align        16
-+.L128:
-+      brz,pt          $tmp, .L128aligned
-+      nop
-+
-+      ldd             [$inp + 16], %f4
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+.L128aligned:
-+___
-+for ($i=0; $i<10; $i++) {
-+    $code.=<<___;
-+      std             %f0, [$out + `16*$i+0`]
-+      aes_kexpand1    %f0, %f2, $i, %f0
-+      std             %f2, [$out + `16*$i+8`]
-+      aes_kexpand2    %f2, %f0, %f2
-+___
-+}
-+$code.=<<___;
-+      std             %f0, [$out + `16*$i+0`]
-+      std             %f2, [$out + `16*$i+8`]
-+
-+      mov             10, $tmp
-+      st              $tmp, [$out + 240]
-+      retl
-+      xor             %o0, %o0, %o0
-+.type aes_t4_set_encrypt_key,#function
-+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
-+
-+.globl        aes_t4_set_decrypt_key
-+.align        32
-+aes_t4_set_decrypt_key:
-+      mov             %o7, %o5
-+      call            .Lset_encrypt_key
-+      nop
-+
-+      mov             %o5, %o7
-+      sll             $tmp, 4, $inp           ! $tmp is number of rounds
-+      add             $tmp, 2, $tmp
-+      add             $out, $inp, $inp        ! $inp=$out+16*rounds
-+      srl             $tmp, 2, $tmp           ! $tmp=(rounds+2)/4
-+
-+.Lkey_flip:
-+      ldd             [$out + 0],  %f0
-+      ldd             [$out + 8],  %f2
-+      ldd             [$out + 16], %f4
-+      ldd             [$out + 24], %f6
-+      ldd             [$inp + 0],  %f8
-+      ldd             [$inp + 8],  %f10
-+      ldd             [$inp - 16], %f12
-+      ldd             [$inp - 8],  %f14
-+      sub             $tmp, 1, $tmp
-+      std             %f0, [$inp + 0]
-+      std             %f2, [$inp + 8]
-+      std             %f4, [$inp - 16]
-+      std             %f6, [$inp - 8]
-+      std             %f8, [$out + 0]
-+      std             %f10, [$out + 8]
-+      std             %f12, [$out + 16]
-+      std             %f14, [$out + 24]
-+      add             $out, 32, $out
-+      brnz            $tmp, .Lkey_flip
-+      sub             $inp, 32, $inp
-+
-+      retl
-+      xor             %o0, %o0, %o0
-+.type aes_t4_set_decrypt_key,#function
-+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
-+___
-+}
-+
-+{{{
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
-+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
-+
-+$code.=<<___;
-+.align        32
-+_aes128_loadkey:
-+      ldx             [$key + 0], %g4
-+      ldx             [$key + 8], %g5
-+___
-+for ($i=2; $i<22;$i++) {                      # load key schedule
-+    $code.=<<___;
-+      ldd             [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+      retl
-+      nop
-+.type _aes128_loadkey,#function
-+.size _aes128_loadkey,.-_aes128_loadkey
-+_aes128_load_enckey=_aes128_loadkey
-+_aes128_load_deckey=_aes128_loadkey
-+
-+.align        32
-+_aes128_encrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f48, %f0, %f2, %f4
-+      aes_eround23    %f50, %f0, %f2, %f2
-+      aes_eround01_l  %f52, %f4, %f2, %f0
-+      retl
-+      aes_eround23_l  %f54, %f4, %f2, %f2
-+.type _aes128_encrypt_1x,#function
-+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
-+
-+.align        32
-+_aes128_encrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f48, %f0, %f2, %f8
-+      aes_eround23    %f50, %f0, %f2, %f2
-+      aes_eround01    %f48, %f4, %f6, %f10
-+      aes_eround23    %f50, %f4, %f6, %f6
-+      aes_eround01_l  %f52, %f8, %f2, %f0
-+      aes_eround23_l  %f54, %f8, %f2, %f2
-+      aes_eround01_l  %f52, %f10, %f6, %f4
-+      retl
-+      aes_eround23_l  %f54, %f10, %f6, %f6
-+.type _aes128_encrypt_2x,#function
-+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
-+
-+.align        32
-+_aes128_decrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f48, %f0, %f2, %f4
-+      aes_dround23    %f50, %f0, %f2, %f2
-+      aes_dround01_l  %f52, %f4, %f2, %f0
-+      retl
-+      aes_dround23_l  %f54, %f4, %f2, %f2
-+.type _aes128_decrypt_1x,#function
-+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
-+
-+.align        32
-+_aes128_decrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f48, %f0, %f2, %f8
-+      aes_dround23    %f50, %f0, %f2, %f2
-+      aes_dround01    %f48, %f4, %f6, %f10
-+      aes_dround23    %f50, %f4, %f6, %f6
-+      aes_dround01_l  %f52, %f8, %f2, %f0
-+      aes_dround23_l  %f54, %f8, %f2, %f2
-+      aes_dround01_l  %f52, %f10, %f6, %f4
-+      retl
-+      aes_dround23_l  %f54, %f10, %f6, %f6
-+.type _aes128_decrypt_2x,#function
-+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
-+
-+.align        32
-+_aes192_loadkey:
-+_aes256_loadkey:
-+      ldx             [$key + 0], %g4
-+      ldx             [$key + 8], %g5
-+___
-+for ($i=2; $i<26;$i++) {                      # load key schedule
-+    $code.=<<___;
-+      ldd             [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+      retl
-+      nop
-+.type _aes192_loadkey,#function
-+.size _aes192_loadkey,.-_aes192_loadkey
-+_aes192_load_enckey=_aes192_loadkey
-+_aes192_load_deckey=_aes192_loadkey
-+_aes256_load_enckey=_aes192_loadkey
-+_aes256_load_deckey=_aes192_loadkey
-+
-+.align        32
-+_aes192_encrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f56, %f0, %f2, %f4
-+      aes_eround23    %f58, %f0, %f2, %f2
-+      aes_eround01_l  %f60, %f4, %f2, %f0
-+      retl
-+      aes_eround23_l  %f62, %f4, %f2, %f2
-+.type _aes192_encrypt_1x,#function
-+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
-+
-+.align        32
-+_aes192_encrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f56, %f0, %f2, %f8
-+      aes_eround23    %f58, %f0, %f2, %f2
-+      aes_eround01    %f56, %f4, %f6, %f10
-+      aes_eround23    %f58, %f4, %f6, %f6
-+      aes_eround01_l  %f60, %f8, %f2, %f0
-+      aes_eround23_l  %f62, %f8, %f2, %f2
-+      aes_eround01_l  %f60, %f10, %f6, %f4
-+      retl
-+      aes_eround23_l  %f62, %f10, %f6, %f6
-+.type _aes192_encrypt_2x,#function
-+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
-+
-+.align        32
-+_aes192_decrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f56, %f0, %f2, %f4
-+      aes_dround23    %f58, %f0, %f2, %f2
-+      aes_dround01_l  %f60, %f4, %f2, %f0
-+      retl
-+      aes_dround23_l  %f62, %f4, %f2, %f2
-+.type _aes192_decrypt_1x,#function
-+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
-+
-+.align        32
-+_aes192_decrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f56, %f0, %f2, %f8
-+      aes_dround23    %f58, %f0, %f2, %f2
-+      aes_dround01    %f56, %f4, %f6, %f10
-+      aes_dround23    %f58, %f4, %f6, %f6
-+      aes_dround01_l  %f60, %f8, %f2, %f0
-+      aes_dround23_l  %f62, %f8, %f2, %f2
-+      aes_dround01_l  %f60, %f10, %f6, %f4
-+      retl
-+      aes_dround23_l  %f62, %f10, %f6, %f6
-+.type _aes192_decrypt_2x,#function
-+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
-+
-+.align        32
-+_aes256_encrypt_1x:
-+      aes_eround01    %f16, %f0, %f2, %f4
-+      aes_eround23    %f18, %f0, %f2, %f2
-+      ldd             [$key + 208], %f16
-+      ldd             [$key + 216], %f18
-+      aes_eround01    %f20, %f4, %f2, %f0
-+      aes_eround23    %f22, %f4, %f2, %f2
-+      ldd             [$key + 224], %f20
-+      ldd             [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f16, %f0, %f2, %f4
-+      aes_eround23    %f18, %f0, %f2, %f2
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      aes_eround01_l  %f20, %f4, %f2, %f0
-+      aes_eround23_l  %f22, %f4, %f2, %f2
-+      ldd             [$key + 32], %f20
-+      retl
-+      ldd             [$key + 40], %f22
-+.type _aes256_encrypt_1x,#function
-+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
-+
-+.align        32
-+_aes256_encrypt_2x:
-+      aes_eround01    %f16, %f0, %f2, %f8
-+      aes_eround23    %f18, %f0, %f2, %f2
-+      aes_eround01    %f16, %f4, %f6, %f10
-+      aes_eround23    %f18, %f4, %f6, %f6
-+      ldd             [$key + 208], %f16
-+      ldd             [$key + 216], %f18
-+      aes_eround01    %f20, %f8, %f2, %f0
-+      aes_eround23    %f22, %f8, %f2, %f2
-+      aes_eround01    %f20, %f10, %f6, %f4
-+      aes_eround23    %f22, %f10, %f6, %f6
-+      ldd             [$key + 224], %f20
-+      ldd             [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+    $code.=<<___;
-+      aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_eround01    %f16, %f0, %f2, %f8
-+      aes_eround23    %f18, %f0, %f2, %f2
-+      aes_eround01    %f16, %f4, %f6, %f10
-+      aes_eround23    %f18, %f4, %f6, %f6
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      aes_eround01_l  %f20, %f8, %f2, %f0
-+      aes_eround23_l  %f22, %f8, %f2, %f2
-+      aes_eround01_l  %f20, %f10, %f6, %f4
-+      aes_eround23_l  %f22, %f10, %f6, %f6
-+      ldd             [$key + 32], %f20
-+      retl
-+      ldd             [$key + 40], %f22
-+.type _aes256_encrypt_2x,#function
-+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
-+
-+.align        32
-+_aes256_decrypt_1x:
-+      aes_dround01    %f16, %f0, %f2, %f4
-+      aes_dround23    %f18, %f0, %f2, %f2
-+      ldd             [$key + 208], %f16
-+      ldd             [$key + 216], %f18
-+      aes_dround01    %f20, %f4, %f2, %f0
-+      aes_dround23    %f22, %f4, %f2, %f2
-+      ldd             [$key + 224], %f20
-+      ldd             [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f16, %f0, %f2, %f4
-+      aes_dround23    %f18, %f0, %f2, %f2
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      aes_dround01_l  %f20, %f4, %f2, %f0
-+      aes_dround23_l  %f22, %f4, %f2, %f2
-+      ldd             [$key + 32], %f20
-+      retl
-+      ldd             [$key + 40], %f22
-+.type _aes256_decrypt_1x,#function
-+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
-+
-+.align        32
-+_aes256_decrypt_2x:
-+      aes_dround01    %f16, %f0, %f2, %f8
-+      aes_dround23    %f18, %f0, %f2, %f2
-+      aes_dround01    %f16, %f4, %f6, %f10
-+      aes_dround23    %f18, %f4, %f6, %f6
-+      ldd             [$key + 208], %f16
-+      ldd             [$key + 216], %f18
-+      aes_dround01    %f20, %f8, %f2, %f0
-+      aes_dround23    %f22, %f8, %f2, %f2
-+      aes_dround01    %f20, %f10, %f6, %f4
-+      aes_dround23    %f22, %f10, %f6, %f6
-+      ldd             [$key + 224], %f20
-+      ldd             [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+    $code.=<<___;
-+      aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
-+      aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
-+      aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
-+      aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
-+      aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
-+      aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
-+      aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
-+      aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+      aes_dround01    %f16, %f0, %f2, %f8
-+      aes_dround23    %f18, %f0, %f2, %f2
-+      aes_dround01    %f16, %f4, %f6, %f10
-+      aes_dround23    %f18, %f4, %f6, %f6
-+      ldd             [$key + 16], %f16
-+      ldd             [$key + 24], %f18
-+      aes_dround01_l  %f20, %f8, %f2, %f0
-+      aes_dround23_l  %f22, %f8, %f2, %f2
-+      aes_dround01_l  %f20, %f10, %f6, %f4
-+      aes_dround23_l  %f22, %f10, %f6, %f6
-+      ldd             [$key + 32], %f20
-+      retl
-+      ldd             [$key + 40], %f22
-+.type _aes256_decrypt_2x,#function
-+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
-+___
-+
-+&alg_cbc_encrypt_implement("aes",128);
-+&alg_cbc_encrypt_implement("aes",192);
-+&alg_cbc_encrypt_implement("aes",256);
-+
-+&alg_cbc_decrypt_implement("aes",128);
-+&alg_cbc_decrypt_implement("aes",192);
-+&alg_cbc_decrypt_implement("aes",256);
-+
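-+# Each generator call above expands the name template
-+# "${alg}${bits}_t4_cbc_(en|de)crypt", so ("aes",256) yields
-+# aes256_t4_cbc_encrypt -- the very symbol the AES_cbc_encrypt
-+# dispatcher below branches to. A one-line Perl restatement of the
-+# naming rule (illustrative sketch only):
-+{ my ($alg,$bits) = ("aes",256); my $fn = "${alg}${bits}_t4_cbc_encrypt"; }
-+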
-+if ($::evp) {
-+    &alg_ctr32_implement("aes",128);
-+    &alg_ctr32_implement("aes",192);
-+    &alg_ctr32_implement("aes",256);
-+}
-+}}}
-+
-+if (!$::evp) {
-+$code.=<<___;
-+.global       AES_encrypt
-+AES_encrypt=aes_t4_encrypt
-+.global       AES_decrypt
-+AES_decrypt=aes_t4_decrypt
-+.global       AES_set_encrypt_key
-+.align        32
-+AES_set_encrypt_key:
-+      andcc           %o2, 7, %g0             ! check alignment
-+      bnz,a,pn        %icc, 1f
-+      mov             -1, %o0
-+      brz,a,pn        %o0, 1f
-+      mov             -1, %o0
-+      brz,a,pn        %o2, 1f
-+      mov             -1, %o0
-+      andncc          %o1, 0x1c0, %g0
-+      bnz,a,pn        %icc, 1f
-+      mov             -2, %o0
-+      cmp             %o1, 128
-+      bl,a,pn         %icc, 1f
-+      mov             -2, %o0
-+      b               aes_t4_set_encrypt_key
-+      nop
-+1:    retl
-+      nop
-+.type AES_set_encrypt_key,#function
-+.size AES_set_encrypt_key,.-AES_set_encrypt_key
-+
-+.global       AES_set_decrypt_key
-+.align        32
-+AES_set_decrypt_key:
-+      andcc           %o2, 7, %g0             ! check alignment
-+      bnz,a,pn        %icc, 1f
-+      mov             -1, %o0
-+      brz,a,pn        %o0, 1f
-+      mov             -1, %o0
-+      brz,a,pn        %o2, 1f
-+      mov             -1, %o0
-+      andncc          %o1, 0x1c0, %g0
-+      bnz,a,pn        %icc, 1f
-+      mov             -2, %o0
-+      cmp             %o1, 128
-+      bl,a,pn         %icc, 1f
-+      mov             -2, %o0
-+      b               aes_t4_set_decrypt_key
-+      nop
-+1:    retl
-+      nop
-+.type AES_set_decrypt_key,#function
-+.size AES_set_decrypt_key,.-AES_set_decrypt_key
-+___
-+
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
-+
-+$code.=<<___;
-+.globl        AES_cbc_encrypt
-+.align        32
-+AES_cbc_encrypt:
-+      ld              [$key + 240], %g1
-+      nop
-+      brz             $enc, .Lcbc_decrypt
-+      cmp             %g1, 12
-+
-+      bl,pt           %icc, aes128_t4_cbc_encrypt
-+      nop
-+      be,pn           %icc, aes192_t4_cbc_encrypt
-+      nop
-+      ba              aes256_t4_cbc_encrypt
-+      nop
-+
-+.Lcbc_decrypt:
-+      bl,pt           %icc, aes128_t4_cbc_decrypt
-+      nop
-+      be,pn           %icc, aes192_t4_cbc_decrypt
-+      nop
-+      ba              aes256_t4_cbc_decrypt
-+      nop
-+.type AES_cbc_encrypt,#function
-+.size AES_cbc_encrypt,.-AES_cbc_encrypt
-+___
-+}
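-+# The dispatcher above reads the word at byte offset 240 of the key
-+# schedule -- the round count in the standard AES_KEY layout -- and
-+# compares it with 12: AES-128 uses 10 rounds, AES-192 uses 12,
-+# AES-256 uses 14, hence the bl/be/ba chain. A Perl restatement of
-+# that mapping (illustrative sketch only):
-+{ my %bits_for_rounds = (10 => 128, 12 => 192, 14 => 256); }
-+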
-+$code.=<<___;
-+.asciz        "AES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align        4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/des/asm/dest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
---- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,602 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <[email protected]> and Andy Polyakov
-+# <[email protected]>. The module is licensed under 2-clause BSD
-+# license. March 2013. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# DES for SPARC T4.
-+#
-+# As with other hardware-assisted ciphers, CBC encrypt results [for
-+# aligned data] are virtually identical to critical path lengths
-+# (CBC encryption is inherently serial; CBC decryption parallelizes
-+# across blocks, hence its lower figures):
-+#
-+#             DES             Triple-DES
-+# CBC encrypt 4.14/4.15(*)    11.7/11.7
-+# CBC decrypt 1.77/4.11(**)   6.42/7.47
-+#
-+#                      (*)    numbers after slash are for
-+#                             misaligned data;
-+#                      (**)   this is the result for the largest
-+#                             block size; unlike in all other
-+#                             cases, results for smaller blocks
-+#                             are better[?];
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$code.=<<___ if ($::abibits==64);
-+.register       %g2,#scratch
-+.register       %g3,#scratch
-+___
-+
-+$code.=<<___;
-+.text
-+___
-+
-+{ my ($inp,$out)=("%o0","%o1");
-+
-+$code.=<<___;
-+.align        32
-+.globl        des_t4_key_expand
-+.type des_t4_key_expand,#function
-+des_t4_key_expand:
-+      andcc           $inp, 0x7, %g0
-+      alignaddr       $inp, %g0, $inp
-+      bz,pt           %icc, 1f
-+      ldd             [$inp + 0x00], %f0
-+      ldd             [$inp + 0x08], %f2
-+      faligndata      %f0, %f2, %f0
-+1:    des_kexpand     %f0, 0, %f0
-+      des_kexpand     %f0, 1, %f2
-+      std             %f0, [$out + 0x00]
-+      des_kexpand     %f2, 3, %f6
-+      std             %f2, [$out + 0x08]
-+      des_kexpand     %f2, 2, %f4
-+      des_kexpand     %f6, 3, %f10
-+      std             %f6, [$out + 0x18]
-+      des_kexpand     %f6, 2, %f8
-+      std             %f4, [$out + 0x10]
-+      des_kexpand     %f10, 3, %f14
-+      std             %f10, [$out + 0x28]
-+      des_kexpand     %f10, 2, %f12
-+      std             %f8, [$out + 0x20]
-+      des_kexpand     %f14, 1, %f16
-+      std             %f14, [$out + 0x38]
-+      des_kexpand     %f16, 3, %f20
-+      std             %f12, [$out + 0x30]
-+      des_kexpand     %f16, 2, %f18
-+      std             %f16, [$out + 0x40]
-+      des_kexpand     %f20, 3, %f24
-+      std             %f20, [$out + 0x50]
-+      des_kexpand     %f20, 2, %f22
-+      std             %f18, [$out + 0x48]
-+      des_kexpand     %f24, 3, %f28
-+      std             %f24, [$out + 0x60]
-+      des_kexpand     %f24, 2, %f26
-+      std             %f22, [$out + 0x58]
-+      des_kexpand     %f28, 1, %f30
-+      std             %f28, [$out + 0x70]
-+      std             %f26, [$out + 0x68]
-+      retl
-+      std             %f30, [$out + 0x78]
-+.size des_t4_key_expand,.-des_t4_key_expand
-+___
-+}
-+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
-+  my ($ileft,$iright,$omask) = map("%g$_",(1..3));
-+
-+$code.=<<___;
-+.globl        des_t4_cbc_encrypt
-+.align        32
-+des_t4_cbc_encrypt:
-+      ld              [$ivec + 0], %f0        ! load ivec
-+      ld              [$ivec + 4], %f1
-+
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             0xff, $omask
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      sub             %g0, $ileft, $iright
-+      and             $out, 7, %g4
-+      alignaddrl      $out, %g0, $out
-+      srl             $omask, %g4, $omask
-+      srlx            $len, 3, $len
-+      movrz           %g4, 0, $omask
-+      prefetch        [$out], 22
-+
-+      ldd             [$key + 0x00], %f4      ! load key schedule
-+      ldd             [$key + 0x08], %f6
-+      ldd             [$key + 0x10], %f8
-+      ldd             [$key + 0x18], %f10
-+      ldd             [$key + 0x20], %f12
-+      ldd             [$key + 0x28], %f14
-+      ldd             [$key + 0x30], %f16
-+      ldd             [$key + 0x38], %f18
-+      ldd             [$key + 0x40], %f20
-+      ldd             [$key + 0x48], %f22
-+      ldd             [$key + 0x50], %f24
-+      ldd             [$key + 0x58], %f26
-+      ldd             [$key + 0x60], %f28
-+      ldd             [$key + 0x68], %f30
-+      ldd             [$key + 0x70], %f32
-+      ldd             [$key + 0x78], %f34
-+
-+.Ldes_cbc_enc_loop:
-+      ldx             [$inp + 0], %g4
-+      brz,pt          $ileft, 4f
-+      nop
-+
-+      ldx             [$inp + 8], %g5
-+      sllx            %g4, $ileft, %g4
-+      srlx            %g5, $iright, %g5
-+      or              %g5, %g4, %g4
-+4:
-+      movxtod         %g4, %f2
-+      prefetch        [$inp + 8+63], 20
-+      add             $inp, 8, $inp
-+      fxor            %f2, %f0, %f0           ! ^= ivec
-+      prefetch        [$out + 63], 22
-+
-+      des_ip          %f0, %f0
-+      des_round       %f4, %f6, %f0, %f0
-+      des_round       %f8, %f10, %f0, %f0
-+      des_round       %f12, %f14, %f0, %f0
-+      des_round       %f16, %f18, %f0, %f0
-+      des_round       %f20, %f22, %f0, %f0
-+      des_round       %f24, %f26, %f0, %f0
-+      des_round       %f28, %f30, %f0, %f0
-+      des_round       %f32, %f34, %f0, %f0
-+      des_iip         %f0, %f0
-+
-+      brnz,pn         $omask, 2f
-+      sub             $len, 1, $len
-+
-+      std             %f0, [$out + 0]
-+      brnz,pt         $len, .Ldes_cbc_enc_loop
-+      add             $out, 8, $out
-+
-+      st              %f0, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f1, [$ivec + 4]
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
-+                                              ! and ~4x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f2           ! handle unaligned output
-+
-+      stda            %f2, [$out + $omask]0xc0        ! partial store
-+      add             $out, 8, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f2, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .Ldes_cbc_enc_loop+4
-+      orn             %g0, $omask, $omask
-+
-+      st              %f0, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f1, [$ivec + 4]
-+.type des_t4_cbc_encrypt,#function
-+.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
-+
-+.globl        des_t4_cbc_decrypt
-+.align        32
-+des_t4_cbc_decrypt:
-+      ld              [$ivec + 0], %f2        ! load ivec
-+      ld              [$ivec + 4], %f3
-+
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             0xff, $omask
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      sub             %g0, $ileft, $iright
-+      and             $out, 7, %g4
-+      alignaddrl      $out, %g0, $out
-+      srl             $omask, %g4, $omask
-+      srlx            $len, 3, $len
-+      movrz           %g4, 0, $omask
-+      prefetch        [$out], 22
-+
-+      ldd             [$key + 0x78], %f4      ! load key schedule
-+      ldd             [$key + 0x70], %f6
-+      ldd             [$key + 0x68], %f8
-+      ldd             [$key + 0x60], %f10
-+      ldd             [$key + 0x58], %f12
-+      ldd             [$key + 0x50], %f14
-+      ldd             [$key + 0x48], %f16
-+      ldd             [$key + 0x40], %f18
-+      ldd             [$key + 0x38], %f20
-+      ldd             [$key + 0x30], %f22
-+      ldd             [$key + 0x28], %f24
-+      ldd             [$key + 0x20], %f26
-+      ldd             [$key + 0x18], %f28
-+      ldd             [$key + 0x10], %f30
-+      ldd             [$key + 0x08], %f32
-+      ldd             [$key + 0x00], %f34
-+
-+.Ldes_cbc_dec_loop:
-+      ldx             [$inp + 0], %g4
-+      brz,pt          $ileft, 4f
-+      nop
-+
-+      ldx             [$inp + 8], %g5
-+      sllx            %g4, $ileft, %g4
-+      srlx            %g5, $iright, %g5
-+      or              %g5, %g4, %g4
-+4:
-+      movxtod         %g4, %f0
-+      prefetch        [$inp + 8+63], 20
-+      add             $inp, 8, $inp
-+      prefetch        [$out + 63], 22
-+
-+      des_ip          %f0, %f0
-+      des_round       %f4, %f6, %f0, %f0
-+      des_round       %f8, %f10, %f0, %f0
-+      des_round       %f12, %f14, %f0, %f0
-+      des_round       %f16, %f18, %f0, %f0
-+      des_round       %f20, %f22, %f0, %f0
-+      des_round       %f24, %f26, %f0, %f0
-+      des_round       %f28, %f30, %f0, %f0
-+      des_round       %f32, %f34, %f0, %f0
-+      des_iip         %f0, %f0
-+
-+      fxor            %f2, %f0, %f0           ! ^= ivec
-+      movxtod         %g4, %f2
-+
-+      brnz,pn         $omask, 2f
-+      sub             $len, 1, $len
-+
-+      std             %f0, [$out + 0]
-+      brnz,pt         $len, .Ldes_cbc_dec_loop
-+      add             $out, 8, $out
-+
-+      st              %f2, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f3, [$ivec + 4]
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
-+                                              ! and ~4x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f0           ! handle unaligned output
-+
-+      stda            %f0, [$out + $omask]0xc0        ! partial store
-+      add             $out, 8, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f0, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .Ldes_cbc_dec_loop+4
-+      orn             %g0, $omask, $omask
-+
-+      st              %f2, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f3, [$ivec + 4]
-+.type des_t4_cbc_decrypt,#function
-+.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
-+___
-+
-+# One might wonder why there are back-to-back des_iip/des_ip pairs
-+# between the EDE passes. Aren't they inverses of each other? They
-+# almost are: the net outcome of the pair is that the two 32-bit
-+# words in the target register are swapped. Consider the
-+# des_iip/des_ip pair a way to perform that swap; it is actually the
-+# fastest way to do so in this case.
-+
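-+# In scalar terms, the net effect of such a pair is exchanging the
-+# 32-bit halves of a 64-bit value. A minimal sketch of that swap
-+# (hypothetical helper, assumes 64-bit Perl integers):
-+sub swap32 {
-+    my $v = shift;
-+    return (($v & 0xffffffff) << 32) | (($v >> 32) & 0xffffffff);
-+}
-+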
-+$code.=<<___;
-+.globl        des_t4_ede3_cbc_encrypt
-+.align        32
-+des_t4_ede3_cbc_encrypt:
-+      ld              [$ivec + 0], %f0        ! load ivec
-+      ld              [$ivec + 4], %f1
-+
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             0xff, $omask
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      sub             %g0, $ileft, $iright
-+      and             $out, 7, %g4
-+      alignaddrl      $out, %g0, $out
-+      srl             $omask, %g4, $omask
-+      srlx            $len, 3, $len
-+      movrz           %g4, 0, $omask
-+      prefetch        [$out], 22
-+
-+      ldd             [$key + 0x00], %f4      ! load key schedule
-+      ldd             [$key + 0x08], %f6
-+      ldd             [$key + 0x10], %f8
-+      ldd             [$key + 0x18], %f10
-+      ldd             [$key + 0x20], %f12
-+      ldd             [$key + 0x28], %f14
-+      ldd             [$key + 0x30], %f16
-+      ldd             [$key + 0x38], %f18
-+      ldd             [$key + 0x40], %f20
-+      ldd             [$key + 0x48], %f22
-+      ldd             [$key + 0x50], %f24
-+      ldd             [$key + 0x58], %f26
-+      ldd             [$key + 0x60], %f28
-+      ldd             [$key + 0x68], %f30
-+      ldd             [$key + 0x70], %f32
-+      ldd             [$key + 0x78], %f34
-+
-+.Ldes_ede3_cbc_enc_loop:
-+      ldx             [$inp + 0], %g4
-+      brz,pt          $ileft, 4f
-+      nop
-+
-+      ldx             [$inp + 8], %g5
-+      sllx            %g4, $ileft, %g4
-+      srlx            %g5, $iright, %g5
-+      or              %g5, %g4, %g4
-+4:
-+      movxtod         %g4, %f2
-+      prefetch        [$inp + 8+63], 20
-+      add             $inp, 8, $inp
-+      fxor            %f2, %f0, %f0           ! ^= ivec
-+      prefetch        [$out + 63], 22
-+
-+      des_ip          %f0, %f0
-+      des_round       %f4, %f6, %f0, %f0
-+      des_round       %f8, %f10, %f0, %f0
-+      des_round       %f12, %f14, %f0, %f0
-+      des_round       %f16, %f18, %f0, %f0
-+      ldd             [$key + 0x100-0x08], %f36
-+      ldd             [$key + 0x100-0x10], %f38
-+      des_round       %f20, %f22, %f0, %f0
-+      ldd             [$key + 0x100-0x18], %f40
-+      ldd             [$key + 0x100-0x20], %f42
-+      des_round       %f24, %f26, %f0, %f0
-+      ldd             [$key + 0x100-0x28], %f44
-+      ldd             [$key + 0x100-0x30], %f46
-+      des_round       %f28, %f30, %f0, %f0
-+      ldd             [$key + 0x100-0x38], %f48
-+      ldd             [$key + 0x100-0x40], %f50
-+      des_round       %f32, %f34, %f0, %f0
-+      ldd             [$key + 0x100-0x48], %f52
-+      ldd             [$key + 0x100-0x50], %f54
-+      des_iip         %f0, %f0
-+
-+      ldd             [$key + 0x100-0x58], %f56
-+      ldd             [$key + 0x100-0x60], %f58
-+      des_ip          %f0, %f0
-+      ldd             [$key + 0x100-0x68], %f60
-+      ldd             [$key + 0x100-0x70], %f62
-+      des_round       %f36, %f38, %f0, %f0
-+      ldd             [$key + 0x100-0x78], %f36
-+      ldd             [$key + 0x100-0x80], %f38
-+      des_round       %f40, %f42, %f0, %f0
-+      des_round       %f44, %f46, %f0, %f0
-+      des_round       %f48, %f50, %f0, %f0
-+      ldd             [$key + 0x100+0x00], %f40
-+      ldd             [$key + 0x100+0x08], %f42
-+      des_round       %f52, %f54, %f0, %f0
-+      ldd             [$key + 0x100+0x10], %f44
-+      ldd             [$key + 0x100+0x18], %f46
-+      des_round       %f56, %f58, %f0, %f0
-+      ldd             [$key + 0x100+0x20], %f48
-+      ldd             [$key + 0x100+0x28], %f50
-+      des_round       %f60, %f62, %f0, %f0
-+      ldd             [$key + 0x100+0x30], %f52
-+      ldd             [$key + 0x100+0x38], %f54
-+      des_round       %f36, %f38, %f0, %f0
-+      ldd             [$key + 0x100+0x40], %f56
-+      ldd             [$key + 0x100+0x48], %f58
-+      des_iip         %f0, %f0
-+
-+      ldd             [$key + 0x100+0x50], %f60
-+      ldd             [$key + 0x100+0x58], %f62
-+      des_ip          %f0, %f0
-+      ldd             [$key + 0x100+0x60], %f36
-+      ldd             [$key + 0x100+0x68], %f38
-+      des_round       %f40, %f42, %f0, %f0
-+      ldd             [$key + 0x100+0x70], %f40
-+      ldd             [$key + 0x100+0x78], %f42
-+      des_round       %f44, %f46, %f0, %f0
-+      des_round       %f48, %f50, %f0, %f0
-+      des_round       %f52, %f54, %f0, %f0
-+      des_round       %f56, %f58, %f0, %f0
-+      des_round       %f60, %f62, %f0, %f0
-+      des_round       %f36, %f38, %f0, %f0
-+      des_round       %f40, %f42, %f0, %f0
-+      des_iip         %f0, %f0
-+
-+      brnz,pn         $omask, 2f
-+      sub             $len, 1, $len
-+
-+      std             %f0, [$out + 0]
-+      brnz,pt         $len, .Ldes_ede3_cbc_enc_loop
-+      add             $out, 8, $out
-+
-+      st              %f0, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f1, [$ivec + 4]
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
-+                                              ! and ~2x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f2           ! handle unaligned output
-+
-+      stda            %f2, [$out + $omask]0xc0        ! partial store
-+      add             $out, 8, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f2, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .Ldes_ede3_cbc_enc_loop+4
-+      orn             %g0, $omask, $omask
-+
-+      st              %f0, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f1, [$ivec + 4]
-+.type des_t4_ede3_cbc_encrypt,#function
-+.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
-+
-+.globl        des_t4_ede3_cbc_decrypt
-+.align        32
-+des_t4_ede3_cbc_decrypt:
-+      ld              [$ivec + 0], %f2        ! load ivec
-+      ld              [$ivec + 4], %f3
-+
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             0xff, $omask
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      sub             %g0, $ileft, $iright
-+      and             $out, 7, %g4
-+      alignaddrl      $out, %g0, $out
-+      srl             $omask, %g4, $omask
-+      srlx            $len, 3, $len
-+      movrz           %g4, 0, $omask
-+      prefetch        [$out], 22
-+
-+      ldd             [$key + 0x100+0x78], %f4        ! load key schedule
-+      ldd             [$key + 0x100+0x70], %f6
-+      ldd             [$key + 0x100+0x68], %f8
-+      ldd             [$key + 0x100+0x60], %f10
-+      ldd             [$key + 0x100+0x58], %f12
-+      ldd             [$key + 0x100+0x50], %f14
-+      ldd             [$key + 0x100+0x48], %f16
-+      ldd             [$key + 0x100+0x40], %f18
-+      ldd             [$key + 0x100+0x38], %f20
-+      ldd             [$key + 0x100+0x30], %f22
-+      ldd             [$key + 0x100+0x28], %f24
-+      ldd             [$key + 0x100+0x20], %f26
-+      ldd             [$key + 0x100+0x18], %f28
-+      ldd             [$key + 0x100+0x10], %f30
-+      ldd             [$key + 0x100+0x08], %f32
-+      ldd             [$key + 0x100+0x00], %f34
-+
-+.Ldes_ede3_cbc_dec_loop:
-+      ldx             [$inp + 0], %g4
-+      brz,pt          $ileft, 4f
-+      nop
-+
-+      ldx             [$inp + 8], %g5
-+      sllx            %g4, $ileft, %g4
-+      srlx            %g5, $iright, %g5
-+      or              %g5, %g4, %g4
-+4:
-+      movxtod         %g4, %f0
-+      prefetch        [$inp + 8+63], 20
-+      add             $inp, 8, $inp
-+      prefetch        [$out + 63], 22
-+
-+      des_ip          %f0, %f0
-+      des_round       %f4, %f6, %f0, %f0
-+      des_round       %f8, %f10, %f0, %f0
-+      des_round       %f12, %f14, %f0, %f0
-+      des_round       %f16, %f18, %f0, %f0
-+      ldd             [$key + 0x80+0x00], %f36
-+      ldd             [$key + 0x80+0x08], %f38
-+      des_round       %f20, %f22, %f0, %f0
-+      ldd             [$key + 0x80+0x10], %f40
-+      ldd             [$key + 0x80+0x18], %f42
-+      des_round       %f24, %f26, %f0, %f0
-+      ldd             [$key + 0x80+0x20], %f44
-+      ldd             [$key + 0x80+0x28], %f46
-+      des_round       %f28, %f30, %f0, %f0
-+      ldd             [$key + 0x80+0x30], %f48
-+      ldd             [$key + 0x80+0x38], %f50
-+      des_round       %f32, %f34, %f0, %f0
-+      ldd             [$key + 0x80+0x40], %f52
-+      ldd             [$key + 0x80+0x48], %f54
-+      des_iip         %f0, %f0
-+
-+      ldd             [$key + 0x80+0x50], %f56
-+      ldd             [$key + 0x80+0x58], %f58
-+      des_ip          %f0, %f0
-+      ldd             [$key + 0x80+0x60], %f60
-+      ldd             [$key + 0x80+0x68], %f62
-+      des_round       %f36, %f38, %f0, %f0
-+      ldd             [$key + 0x80+0x70], %f36
-+      ldd             [$key + 0x80+0x78], %f38
-+      des_round       %f40, %f42, %f0, %f0
-+      des_round       %f44, %f46, %f0, %f0
-+      des_round       %f48, %f50, %f0, %f0
-+      ldd             [$key + 0x80-0x08], %f40
-+      ldd             [$key + 0x80-0x10], %f42
-+      des_round       %f52, %f54, %f0, %f0
-+      ldd             [$key + 0x80-0x18], %f44
-+      ldd             [$key + 0x80-0x20], %f46
-+      des_round       %f56, %f58, %f0, %f0
-+      ldd             [$key + 0x80-0x28], %f48
-+      ldd             [$key + 0x80-0x30], %f50
-+      des_round       %f60, %f62, %f0, %f0
-+      ldd             [$key + 0x80-0x38], %f52
-+      ldd             [$key + 0x80-0x40], %f54
-+      des_round       %f36, %f38, %f0, %f0
-+      ldd             [$key + 0x80-0x48], %f56
-+      ldd             [$key + 0x80-0x50], %f58
-+      des_iip         %f0, %f0
-+
-+      ldd             [$key + 0x80-0x58], %f60
-+      ldd             [$key + 0x80-0x60], %f62
-+      des_ip          %f0, %f0
-+      ldd             [$key + 0x80-0x68], %f36
-+      ldd             [$key + 0x80-0x70], %f38
-+      des_round       %f40, %f42, %f0, %f0
-+      ldd             [$key + 0x80-0x78], %f40
-+      ldd             [$key + 0x80-0x80], %f42
-+      des_round       %f44, %f46, %f0, %f0
-+      des_round       %f48, %f50, %f0, %f0
-+      des_round       %f52, %f54, %f0, %f0
-+      des_round       %f56, %f58, %f0, %f0
-+      des_round       %f60, %f62, %f0, %f0
-+      des_round       %f36, %f38, %f0, %f0
-+      des_round       %f40, %f42, %f0, %f0
-+      des_iip         %f0, %f0
-+
-+      fxor            %f2, %f0, %f0           ! ^= ivec
-+      movxtod         %g4, %f2
-+
-+      brnz,pn         $omask, 2f
-+      sub             $len, 1, $len
-+
-+      std             %f0, [$out + 0]
-+      brnz,pt         $len, .Ldes_ede3_cbc_dec_loop
-+      add             $out, 8, $out
-+
-+      st              %f2, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f3, [$ivec + 4]
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %g4         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f0           ! handle unaligned output
-+
-+      stda            %f0, [$out + $omask]0xc0        ! partial store
-+      add             $out, 8, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f0, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .Ldes_ede3_cbc_dec_loop+4
-+      orn             %g0, $omask, $omask
-+
-+      st              %f2, [$ivec + 0]        ! write out ivec
-+      retl
-+      st              %f3, [$ivec + 4]
-+.type des_t4_ede3_cbc_decrypt,#function
-+.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
-+___
-+}
-+$code.=<<___;
-+.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align  4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/perlasm/sparcv9_modes.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
---- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,1680 @@
-+#!/usr/bin/env perl
-+
-+# Mode-specific implementations for SPARC Architecture 2011. There
-+# is a T4 dependency, though: an ASI value that is not specified in
-+# the Architecture Manual. But as the SPARC universe is rather
-+# monocultural, we assume that any processor capable of executing
-+# the crypto instructions can handle the ASI in question as well.
-+# This means that we ought to keep our eyes open when new processors
-+# emerge...
-+#
-+# As for the above-mentioned ASI: it is the so-called "block
-+# initializing store", which cancels the "read" in "read-update-write"
-+# on cache lines. This is a "cooperative" optimization, as it reduces
-+# overall pressure on the memory interface. The benefit can't be
-+# observed/quantified with the usual benchmarks; on the contrary, you
-+# can notice that single-thread performance for parallelizable modes
-+# is ~1.5% worse for the largest block sizes [though a few percent
-+# better for shorter ones]. All this is based on suggestions from
-+# David Miller.
-+
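-+# The ASI in question appears below as the literal 0xe2 in sequences
-+# like "stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific". A
-+# hypothetical helper that would emit one such 8-byte store and
-+# advance the pointer (sketch only; the generators below inline it):
-+sub blk_init_store {
-+    my ($freg,$ptr) = @_;
-+    $::code .= "\tstda\t\t$freg, [$ptr]0xe2\t! ASI_BLK_INIT, T4-specific\n";
-+    $::code .= "\tadd\t\t$ptr, 8, $ptr\n";
-+}
-+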
-+sub asm_init {                # to be called with @ARGV as argument
-+    for (@_)          { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+    if ($::abibits==64)       { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
-+    else              { $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
-+}
-+
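-+# Typical use, as in the T4 modules that require this file (see
-+# dest4-sparcv9.pl above): forward the compiler flags so the
-+# ABI-dependent globals are set before any code is generated.
-+#
-+#   &asm_init(@ARGV);   # with -m64 or -xarch=v9: $::bias=2047,
-+#                       # $::frame=192, $::size_t_cc="%xcc"
-+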
-+# unified interface
-+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
-+# local variables
-+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
-+
-+sub alg_cbc_encrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl        ${alg}${bits}_t4_cbc_encrypt
-+.align        32
-+${alg}${bits}_t4_cbc_encrypt:
-+      save            %sp, -$::frame, %sp
-+      sub             $inp, $out, $blk_init   ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+      andcc           $ivec, 7, $ivoff
-+      alignaddr       $ivec, %g0, $ivec
-+
-+      ldd             [$ivec + 0], %f0        ! load ivec
-+      bz,pt           %icc, 1f
-+      ldd             [$ivec + 8], %f2
-+      ldd             [$ivec + 16], %f4
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+      ld              [$ivec + 0], %f0
-+      ld              [$ivec + 4], %f1
-+      ld              [$ivec + 8], %f2
-+      ld              [$ivec + 12], %f3
-+___
-+$::code.=<<___;
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      call            _${alg}${bits}_load_enckey
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             64, $iright
-+      mov             0xff, $omask
-+      sub             $iright, $ileft, $iright
-+      and             $out, 7, $ooff
-+      cmp             $len, 127
-+      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
-+      movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
-+      brnz,pn         $blk_init, .L${bits}cbc_enc_blk !       $inp==$out)
-+      srl             $omask, $ooff, $omask
-+
-+      alignaddrl      $out, %g0, $out
-+      srlx            $len, 4, $len
-+      prefetch        [$out], 22
-+
-+.L${bits}_cbc_enc_loop:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+4:
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      movxtod         %o0, %f12
-+      movxtod         %o1, %f14
-+
-+      fxor            %f12, %f0, %f0          ! ^= ivec
-+      fxor            %f14, %f2, %f2
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 16+63], 20
-+      call            _${alg}${bits}_encrypt_1x
-+      add             $inp, 16, $inp
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 1, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      brnz,pt         $len, .L${bits}_cbc_enc_loop
-+      add             $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f0, [$ivec + 0]
-+      st              %f1, [$ivec + 4]
-+      st              %f2, [$ivec + 8]
-+      st              %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, 3f
-+      nop
-+
-+      std             %f0, [$ivec + 0]        ! write out ivec
-+      std             %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f4           ! handle unaligned output
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+
-+      stda            %f4, [$out + $omask]0xc0        ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_cbc_enc_loop+4
-+      orn             %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f0, [$ivec + 0]
-+      st              %f1, [$ivec + 4]
-+      st              %f2, [$ivec + 8]
-+      st              %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, 3f
-+      nop
-+
-+      std             %f0, [$ivec + 0]        ! write out ivec
-+      std             %f2, [$ivec + 8]
-+      ret
-+      restore
-+
-+.align        16
-+3:    alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
-+      mov             0xff, $omask
-+      srl             $omask, $ivoff, $omask
-+      faligndata      %f0, %f0, %f4
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+      stda            %f4, [$ivec + $omask]0xc0
-+      std             %f6, [$ivec + 8]
-+      add             $ivec, 16, $ivec
-+      orn             %g0, $omask, $omask
-+      stda            %f8, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}cbc_enc_blk:
-+      add     $out, $len, $blk_init
-+      and     $blk_init, 63, $blk_init        ! tail
-+      sub     $len, $blk_init, $len
-+      add     $blk_init, 15, $blk_init        ! round up to 16n
-+      srlx    $len, 4, $len
-+      srl     $blk_init, 4, $blk_init
-+
-+.L${bits}_cbc_enc_blk_loop:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 5f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+5:
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      movxtod         %o0, %f12
-+      movxtod         %o1, %f14
-+
-+      fxor            %f12, %f0, %f0          ! ^= ivec
-+      fxor            %f14, %f2, %f2
-+      prefetch        [$inp + 16+63], 20
-+      call            _${alg}${bits}_encrypt_1x
-+      add             $inp, 16, $inp
-+      sub             $len, 1, $len
-+              
-+      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      brnz,pt         $len, .L${bits}_cbc_enc_blk_loop
-+      add             $out, 8, $out
-+
-+      membar          #StoreLoad|#StoreStore
-+      brnz,pt         $blk_init, .L${bits}_cbc_enc_loop
-+      mov             $blk_init, $len
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f0, [$ivec + 0]
-+      st              %f1, [$ivec + 4]
-+      st              %f2, [$ivec + 8]
-+      st              %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, 3b
-+      nop
-+
-+      std             %f0, [$ivec + 0]        ! write out ivec
-+      std             %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+.type ${alg}${bits}_t4_cbc_encrypt,#function
-+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
-+___
-+}
-+
-+sub alg_cbc_decrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl        ${alg}${bits}_t4_cbc_decrypt
-+.align        32
-+${alg}${bits}_t4_cbc_decrypt:
-+      save            %sp, -$::frame, %sp
-+      sub             $inp, $out, $blk_init   ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+      andcc           $ivec, 7, $ivoff
-+      alignaddr       $ivec, %g0, $ivec
-+
-+      ldd             [$ivec + 0], %f12       ! load ivec
-+      bz,pt           %icc, 1f
-+      ldd             [$ivec + 8], %f14
-+      ldd             [$ivec + 16], %f0
-+      faligndata      %f12, %f14, %f12
-+      faligndata      %f14, %f0, %f14
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+      ld              [$ivec + 0], %f12       ! load ivec
-+      ld              [$ivec + 4], %f13
-+      ld              [$ivec + 8], %f14
-+      ld              [$ivec + 12], %f15
-+___
-+$::code.=<<___;
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      call            _${alg}${bits}_load_deckey
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             64, $iright
-+      mov             0xff, $omask
-+      sub             $iright, $ileft, $iright
-+      and             $out, 7, $ooff
-+      cmp             $len, 255
-+      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
-+      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
-+      brnz,pn         $blk_init, .L${bits}cbc_dec_blk !       $inp==$out)
-+      srl             $omask, $ooff, $omask
-+
-+      andcc           $len, 16, %g0           ! is number of blocks even?
-+      srlx            $len, 4, $len
-+      alignaddrl      $out, %g0, $out
-+      bz              %icc, .L${bits}_cbc_dec_loop2x
-+      prefetch        [$out], 22
-+.L${bits}_cbc_dec_loop:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+4:
-+      xor             %g4, %o0, %o2           ! ^= rk[0]
-+      xor             %g5, %o1, %o3
-+      movxtod         %o2, %f0
-+      movxtod         %o3, %f2
-+
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 16+63], 20
-+      call            _${alg}${bits}_decrypt_1x
-+      add             $inp, 16, $inp
-+
-+      fxor            %f12, %f0, %f0          ! ^= ivec
-+      fxor            %f14, %f2, %f2
-+      movxtod         %o0, %f12
-+      movxtod         %o1, %f14
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 1, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      brnz,pt         $len, .L${bits}_cbc_dec_loop2x
-+      add             $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f12, [$ivec + 0]
-+      st              %f13, [$ivec + 4]
-+      st              %f14, [$ivec + 8]
-+      st              %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+      nop
-+
-+      std             %f12, [$ivec + 0]       ! write out ivec
-+      std             %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f4           ! handle unaligned output
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+
-+      stda            %f4, [$out + $omask]0xc0        ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
-+      orn             %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f12, [$ivec + 0]
-+      st              %f13, [$ivec + 4]
-+      st              %f14, [$ivec + 8]
-+      st              %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+      nop
-+
-+      std             %f12, [$ivec + 0]       ! write out ivec
-+      std             %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}_cbc_dec_loop2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+4:
-+      xor             %g4, %o0, %o4           ! ^= rk[0]
-+      xor             %g5, %o1, %o5
-+      movxtod         %o4, %f0
-+      movxtod         %o5, %f2
-+      xor             %g4, %o2, %o4
-+      xor             %g5, %o3, %o5
-+      movxtod         %o4, %f4
-+      movxtod         %o5, %f6
-+
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 32+63], 20
-+      call            _${alg}${bits}_decrypt_2x
-+      add             $inp, 32, $inp
-+
-+      movxtod         %o0, %f8
-+      movxtod         %o1, %f10
-+      fxor            %f12, %f0, %f0          ! ^= ivec
-+      fxor            %f14, %f2, %f2
-+      movxtod         %o2, %f12
-+      movxtod         %o3, %f14
-+      fxor            %f8, %f4, %f4
-+      fxor            %f10, %f6, %f6
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 2, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      std             %f4, [$out + 16]
-+      std             %f6, [$out + 24]
-+      brnz,pt         $len, .L${bits}_cbc_dec_loop2x
-+      add             $out, 32, $out
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f12, [$ivec + 0]
-+      st              %f13, [$ivec + 4]
-+      st              %f14, [$ivec + 8]
-+      st              %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+      nop
-+
-+      std             %f12, [$ivec + 0]       ! write out ivec
-+      std             %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f8           ! handle unaligned output
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+      faligndata      %f4, %f6, %f4
-+      faligndata      %f6, %f6, %f6
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+      std             %f0, [$out + 8]
-+      std             %f2, [$out + 16]
-+      std             %f4, [$out + 24]
-+      add             $out, 32, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f6, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
-+      orn             %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f12, [$ivec + 0]
-+      st              %f13, [$ivec + 4]
-+      st              %f14, [$ivec + 8]
-+      st              %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+      nop
-+
-+      std             %f12, [$ivec + 0]       ! write out ivec
-+      std             %f14, [$ivec + 8]
-+      ret
-+      restore
-+
-+.align        16
-+.L${bits}_cbc_dec_unaligned_ivec:
-+      alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
-+      mov             0xff, $omask
-+      srl             $omask, $ivoff, $omask
-+      faligndata      %f12, %f12, %f0
-+      faligndata      %f12, %f14, %f2
-+      faligndata      %f14, %f14, %f4
-+      stda            %f0, [$ivec + $omask]0xc0
-+      std             %f2, [$ivec + 8]
-+      add             $ivec, 16, $ivec
-+      orn             %g0, $omask, $omask
-+      stda            %f4, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}cbc_dec_blk:
-+      add     $out, $len, $blk_init
-+      and     $blk_init, 63, $blk_init        ! tail
-+      sub     $len, $blk_init, $len
-+      add     $blk_init, 15, $blk_init        ! round up to 16n
-+      srlx    $len, 4, $len
-+      srl     $blk_init, 4, $blk_init
-+      sub     $len, 1, $len
-+      add     $blk_init, 1, $blk_init
-+
-+.L${bits}_cbc_dec_blk_loop2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 5f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+5:
-+      xor             %g4, %o0, %o4           ! ^= rk[0]
-+      xor             %g5, %o1, %o5
-+      movxtod         %o4, %f0
-+      movxtod         %o5, %f2
-+      xor             %g4, %o2, %o4
-+      xor             %g5, %o3, %o5
-+      movxtod         %o4, %f4
-+      movxtod         %o5, %f6
-+
-+      prefetch        [$inp + 32+63], 20
-+      call            _${alg}${bits}_decrypt_2x
-+      add             $inp, 32, $inp
-+      subcc           $len, 2, $len
-+
-+      movxtod         %o0, %f8
-+      movxtod         %o1, %f10
-+      fxor            %f12, %f0, %f0          ! ^= ivec
-+      fxor            %f14, %f2, %f2
-+      movxtod         %o2, %f12
-+      movxtod         %o3, %f14
-+      fxor            %f8, %f4, %f4
-+      fxor            %f10, %f6, %f6
-+
-+      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      bgu,pt          $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
-+      add             $out, 8, $out
-+
-+      add             $blk_init, $len, $len
-+      andcc           $len, 1, %g0            ! is number of blocks even?
-+      membar          #StoreLoad|#StoreStore
-+      bnz,pt          %icc, .L${bits}_cbc_dec_loop
-+      srl             $len, 0, $len
-+      brnz,pn         $len, .L${bits}_cbc_dec_loop2x
-+      nop
-+___
-+$::code.=<<___ if ($::evp);
-+      st              %f12, [$ivec + 0]       ! write out ivec
-+      st              %f13, [$ivec + 4]
-+      st              %f14, [$ivec + 8]
-+      st              %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+      brnz,pn         $ivoff, 3b
-+      nop
-+
-+      std             %f12, [$ivec + 0]       ! write out ivec
-+      std             %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+      ret
-+      restore
-+.type ${alg}${bits}_t4_cbc_decrypt,#function
-+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
-+___
-+}
-+
-+sub alg_ctr32_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl        ${alg}${bits}_t4_ctr32_encrypt
-+.align        32
-+${alg}${bits}_t4_ctr32_encrypt:
-+      save            %sp, -$::frame, %sp
-+
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      call            _${alg}${bits}_load_enckey
-+      sllx            $len, 4, $len
-+
-+      ld              [$ivec + 0], %l4        ! counter
-+      ld              [$ivec + 4], %l5
-+      ld              [$ivec + 8], %l6
-+      ld              [$ivec + 12], %l7
-+
-+      sllx            %l4, 32, %o5
-+      or              %l5, %o5, %o5
-+      sllx            %l6, 32, %g1
-+      xor             %o5, %g4, %g4           ! ^= rk[0]
-+      xor             %g1, %g5, %g5
-+      movxtod         %g4, %f14               ! most significant 64 bits
-+
-+      sub             $inp, $out, $blk_init   ! $inp!=$out
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             64, $iright
-+      mov             0xff, $omask
-+      sub             $iright, $ileft, $iright
-+      and             $out, 7, $ooff
-+      cmp             $len, 255
-+      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
-+      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
-+      brnz,pn         $blk_init, .L${bits}_ctr32_blk  !       $inp==$out)
-+      srl             $omask, $ooff, $omask
-+
-+      andcc           $len, 16, %g0           ! is number of blocks even?
-+      alignaddrl      $out, %g0, $out
-+      bz              %icc, .L${bits}_ctr32_loop2x
-+      srlx            $len, 4, $len
-+.L${bits}_ctr32_loop:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+4:
-+      xor             %g5, %l7, %g1           ! ^= rk[0]
-+      add             %l7, 1, %l7
-+      movxtod         %g1, %f2
-+      srl             %l7, 0, %l7             ! clruw
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 16+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+      aes_eround01    %f16, %f14, %f2, %f4
-+      aes_eround23    %f18, %f14, %f2, %f2
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+      camellia_f      %f16, %f2, %f14, %f2
-+      camellia_f      %f18, %f14, %f2, %f0
-+___
-+$::code.=<<___;
-+      call            _${alg}${bits}_encrypt_1x+8
-+      add             $inp, 16, $inp
-+
-+      movxtod         %o0, %f10
-+      movxtod         %o1, %f12
-+      fxor            %f10, %f0, %f0          ! ^= inp
-+      fxor            %f12, %f2, %f2
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 1, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      brnz,pt         $len, .L${bits}_ctr32_loop2x
-+      add             $out, 16, $out
-+
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f4           ! handle unaligned output
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+      stda            %f4, [$out + $omask]0xc0        ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_ctr32_loop2x+4
-+      orn             %g0, $omask, $omask
-+
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}_ctr32_loop2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+4:
-+      xor             %g5, %l7, %g1           ! ^= rk[0]
-+      add             %l7, 1, %l7
-+      movxtod         %g1, %f2
-+      srl             %l7, 0, %l7             ! clruw
-+      xor             %g5, %l7, %g1
-+      add             %l7, 1, %l7
-+      movxtod         %g1, %f6
-+      srl             %l7, 0, %l7             ! clruw
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+      aes_eround01    %f16, %f14, %f2, %f8
-+      aes_eround23    %f18, %f14, %f2, %f2
-+      aes_eround01    %f16, %f14, %f6, %f10
-+      aes_eround23    %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+      camellia_f      %f16, %f2, %f14, %f2
-+      camellia_f      %f16, %f6, %f14, %f6
-+      camellia_f      %f18, %f14, %f2, %f0
-+      camellia_f      %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+      call            _${alg}${bits}_encrypt_2x+16
-+      add             $inp, 32, $inp
-+
-+      movxtod         %o0, %f8
-+      movxtod         %o1, %f10
-+      movxtod         %o2, %f12
-+      fxor            %f8, %f0, %f0           ! ^= inp
-+      movxtod         %o3, %f8
-+      fxor            %f10, %f2, %f2
-+      fxor            %f12, %f4, %f4
-+      fxor            %f8, %f6, %f6
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 2, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      std             %f4, [$out + 16]
-+      std             %f6, [$out + 24]
-+      brnz,pt         $len, .L${bits}_ctr32_loop2x
-+      add             $out, 32, $out
-+
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f8           ! handle unaligned output
-+      faligndata      %f0, %f2, %f0
-+      faligndata      %f2, %f4, %f2
-+      faligndata      %f4, %f6, %f4
-+      faligndata      %f6, %f6, %f6
-+
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+      std             %f0, [$out + 8]
-+      std             %f2, [$out + 16]
-+      std             %f4, [$out + 24]
-+      add             $out, 32, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f6, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_ctr32_loop2x+4
-+      orn             %g0, $omask, $omask
-+
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}_ctr32_blk:
-+      add     $out, $len, $blk_init
-+      and     $blk_init, 63, $blk_init        ! tail
-+      sub     $len, $blk_init, $len
-+      add     $blk_init, 15, $blk_init        ! round up to 16n
-+      srlx    $len, 4, $len
-+      srl     $blk_init, 4, $blk_init
-+      sub     $len, 1, $len
-+      add     $blk_init, 1, $blk_init
-+
-+.L${bits}_ctr32_blk_loop2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 5f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+5:
-+      xor             %g5, %l7, %g1           ! ^= rk[0]
-+      add             %l7, 1, %l7
-+      movxtod         %g1, %f2
-+      srl             %l7, 0, %l7             ! clruw
-+      xor             %g5, %l7, %g1
-+      add             %l7, 1, %l7
-+      movxtod         %g1, %f6
-+      srl             %l7, 0, %l7             ! clruw
-+      prefetch        [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+      aes_eround01    %f16, %f14, %f2, %f8
-+      aes_eround23    %f18, %f14, %f2, %f2
-+      aes_eround01    %f16, %f14, %f6, %f10
-+      aes_eround23    %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+      camellia_f      %f16, %f2, %f14, %f2
-+      camellia_f      %f16, %f6, %f14, %f6
-+      camellia_f      %f18, %f14, %f2, %f0
-+      camellia_f      %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+      call            _${alg}${bits}_encrypt_2x+16
-+      add             $inp, 32, $inp
-+      subcc           $len, 2, $len
-+
-+      movxtod         %o0, %f8
-+      movxtod         %o1, %f10
-+      movxtod         %o2, %f12
-+      fxor            %f8, %f0, %f0           ! ^= inp
-+      movxtod         %o3, %f8
-+      fxor            %f10, %f2, %f2
-+      fxor            %f12, %f4, %f4
-+      fxor            %f8, %f6, %f6
-+
-+      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      bgu,pt          $::size_t_cc, .L${bits}_ctr32_blk_loop2x
-+      add             $out, 8, $out
-+
-+      add             $blk_init, $len, $len
-+      andcc           $len, 1, %g0            ! is number of blocks even?
-+      membar          #StoreLoad|#StoreStore
-+      bnz,pt          %icc, .L${bits}_ctr32_loop
-+      srl             $len, 0, $len
-+      brnz,pn         $len, .L${bits}_ctr32_loop2x
-+      nop
-+
-+      ret
-+      restore
-+.type ${alg}${bits}_t4_ctr32_encrypt,#function
-+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
-+___
-+}
-+
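A note on the counter arithmetic in the loops above: the 32-bit counter sits in the low word of %l7, and the srl %l7, 0, %l7 that follows each increment (the "clruw" annotation) zero-extends the low 32 bits, so the counter wraps modulo 2^32. A minimal Perl model of the idiom (the variable is illustrative, not part of the patch):

    my $ctr = 0xffffffff;               # counter about to wrap
    $ctr = ($ctr + 1) & 0xffffffff;     # add %l7,1,%l7 ; srl %l7,0,%l7 (clruw)
    printf "%08x\n", $ctr;              # prints 00000000
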
-+sub alg_xts_implement {
-+my ($alg,$bits,$dir) = @_;
-+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
-+my $rem=$ivec;
-+
-+$::code.=<<___;
-+.globl        ${alg}${bits}_t4_xts_${dir}crypt
-+.align        32
-+${alg}${bits}_t4_xts_${dir}crypt:
-+      save            %sp, -$::frame-16, %sp
-+
-+      mov             $ivec, %o0
-+      add             %fp, $::bias-16, %o1
-+      call            ${alg}_t4_encrypt
-+      mov             $key2, %o2
-+
-+      add             %fp, $::bias-16, %l7
-+      ldxa            [%l7]0x88, %g2
-+      add             %fp, $::bias-8, %l7
-+      ldxa            [%l7]0x88, %g3          ! %g3:%g2 is tweak
-+
-+      sethi           %hi(0x76543210), %l7
-+      or              %l7, %lo(0x76543210), %l7
-+      bmask           %l7, %g0, %g0           ! byte swap mask
-+
-+      prefetch        [$inp], 20
-+      prefetch        [$inp + 63], 20
-+      call            _${alg}${bits}_load_${dir}ckey
-+      and             $len, 15,  $rem
-+      and             $len, -16, $len
-+___
-+$code.=<<___ if ($dir eq "de");
-+      mov             0, %l7
-+      movrnz          $rem, 16,  %l7
-+      sub             $len, %l7, $len
-+___
-+$code.=<<___;
-+
-+      sub             $inp, $out, $blk_init   ! $inp!=$out
-+      and             $inp, 7, $ileft
-+      andn            $inp, 7, $inp
-+      sll             $ileft, 3, $ileft
-+      mov             64, $iright
-+      mov             0xff, $omask
-+      sub             $iright, $ileft, $iright
-+      and             $out, 7, $ooff
-+      cmp             $len, 255
-+      movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
-+      movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
-+      brnz,pn         $blk_init, .L${bits}_xts_${dir}blk !    $inp==$out)
-+      srl             $omask, $ooff, $omask
-+
-+      andcc           $len, 16, %g0           ! is number of blocks even?
-+___
-+$code.=<<___ if ($dir eq "de");
-+      brz,pn          $len, .L${bits}_xts_${dir}steal
-+___
-+$code.=<<___;
-+      alignaddrl      $out, %g0, $out
-+      bz              %icc, .L${bits}_xts_${dir}loop2x
-+      srlx            $len, 4, $len
-+.L${bits}_xts_${dir}loop:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+4:
-+      movxtod         %g2, %f12
-+      movxtod         %g3, %f14
-+      bshuffle        %f12, %f12, %f12
-+      bshuffle        %f14, %f14, %f14
-+
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      movxtod         %o0, %f0
-+      movxtod         %o1, %f2
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 16+63], 20
-+      call            _${alg}${bits}_${dir}crypt_1x
-+      add             $inp, 16, $inp
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %g2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %g3
-+      xor             %l7, %g2, %g2
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 1, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      brnz,pt         $len, .L${bits}_xts_${dir}loop2x
-+      add             $out, 16, $out
-+
-+      brnz,pn         $rem, .L${bits}_xts_${dir}steal
-+      nop
-+
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f4           ! handle unaligned output
-+      faligndata      %f0, %f2, %f6
-+      faligndata      %f2, %f2, %f8
-+      stda            %f4, [$out + $omask]0xc0        ! partial store
-+      std             %f6, [$out + 8]
-+      add             $out, 16, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_xts_${dir}loop2x+4
-+      orn             %g0, $omask, $omask
-+
-+      brnz,pn         $rem, .L${bits}_xts_${dir}steal
-+      nop
-+
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}_xts_${dir}loop2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 4f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+4:
-+      movxtod         %g2, %f12
-+      movxtod         %g3, %f14
-+      bshuffle        %f12, %f12, %f12
-+      bshuffle        %f14, %f14, %f14
-+
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %g2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %g3
-+      xor             %l7, %g2, %g2
-+
-+      movxtod         %g2, %f8
-+      movxtod         %g3, %f10
-+      bshuffle        %f8,  %f8,  %f8
-+      bshuffle        %f10, %f10, %f10
-+
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      xor             %g4, %o2, %o2           ! ^= rk[0]
-+      xor             %g5, %o3, %o3
-+      movxtod         %o0, %f0
-+      movxtod         %o1, %f2
-+      movxtod         %o2, %f4
-+      movxtod         %o3, %f6
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+      fxor            %f8,  %f4, %f4          ! ^= tweak[0]
-+      fxor            %f10, %f6, %f6
-+
-+      prefetch        [$out + 63], 22
-+      prefetch        [$inp + 32+63], 20
-+      call            _${alg}${bits}_${dir}crypt_2x
-+      add             $inp, 32, $inp
-+
-+      movxtod         %g2, %f8
-+      movxtod         %g3, %f10
-+
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %g2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %g3
-+      xor             %l7, %g2, %g2
-+
-+      bshuffle        %f8,  %f8,  %f8
-+      bshuffle        %f10, %f10, %f10
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+      fxor            %f8,  %f4, %f4
-+      fxor            %f10, %f6, %f6
-+
-+      brnz,pn         $ooff, 2f
-+      sub             $len, 2, $len
-+              
-+      std             %f0, [$out + 0]
-+      std             %f2, [$out + 8]
-+      std             %f4, [$out + 16]
-+      std             %f6, [$out + 24]
-+      brnz,pt         $len, .L${bits}_xts_${dir}loop2x
-+      add             $out, 32, $out
-+
-+      fsrc2           %f4, %f0
-+      fsrc2           %f6, %f2
-+      brnz,pn         $rem, .L${bits}_xts_${dir}steal
-+      nop
-+
-+      ret
-+      restore
-+
-+.align        16
-+2:    ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
-+                                              ! and ~3x deterioration
-+                                              ! in inp==out case
-+      faligndata      %f0, %f0, %f8           ! handle unaligned output
-+      faligndata      %f0, %f2, %f10
-+      faligndata      %f2, %f4, %f12
-+      faligndata      %f4, %f6, %f14
-+      faligndata      %f6, %f6, %f0
-+
-+      stda            %f8, [$out + $omask]0xc0        ! partial store
-+      std             %f10, [$out + 8]
-+      std             %f12, [$out + 16]
-+      std             %f14, [$out + 24]
-+      add             $out, 32, $out
-+      orn             %g0, $omask, $omask
-+      stda            %f0, [$out + $omask]0xc0        ! partial store
-+
-+      brnz,pt         $len, .L${bits}_xts_${dir}loop2x+4
-+      orn             %g0, $omask, $omask
-+
-+      fsrc2           %f4, %f0
-+      fsrc2           %f6, %f2
-+      brnz,pn         $rem, .L${bits}_xts_${dir}steal
-+      nop
-+
-+      ret
-+      restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align        32
-+.L${bits}_xts_${dir}blk:
-+      add     $out, $len, $blk_init
-+      and     $blk_init, 63, $blk_init        ! tail
-+      sub     $len, $blk_init, $len
-+      add     $blk_init, 15, $blk_init        ! round up to 16n
-+      srlx    $len, 4, $len
-+      srl     $blk_init, 4, $blk_init
-+      sub     $len, 1, $len
-+      add     $blk_init, 1, $blk_init
-+
-+.L${bits}_xts_${dir}blk2x:
-+      ldx             [$inp + 0], %o0
-+      ldx             [$inp + 8], %o1
-+      ldx             [$inp + 16], %o2
-+      brz,pt          $ileft, 5f
-+      ldx             [$inp + 24], %o3
-+
-+      ldx             [$inp + 32], %o4
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      or              %g1, %o0, %o0
-+      sllx            %o1, $ileft, %o1
-+      srlx            %o2, $iright, %g1
-+      or              %g1, %o1, %o1
-+      sllx            %o2, $ileft, %o2
-+      srlx            %o3, $iright, %g1
-+      or              %g1, %o2, %o2
-+      sllx            %o3, $ileft, %o3
-+      srlx            %o4, $iright, %o4
-+      or              %o4, %o3, %o3
-+5:
-+      movxtod         %g2, %f12
-+      movxtod         %g3, %f14
-+      bshuffle        %f12, %f12, %f12
-+      bshuffle        %f14, %f14, %f14
-+
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %g2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %g3
-+      xor             %l7, %g2, %g2
-+
-+      movxtod         %g2, %f8
-+      movxtod         %g3, %f10
-+      bshuffle        %f8,  %f8,  %f8
-+      bshuffle        %f10, %f10, %f10
-+
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      xor             %g4, %o2, %o2           ! ^= rk[0]
-+      xor             %g5, %o3, %o3
-+      movxtod         %o0, %f0
-+      movxtod         %o1, %f2
-+      movxtod         %o2, %f4
-+      movxtod         %o3, %f6
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+      fxor            %f8,  %f4, %f4          ! ^= tweak[0]
-+      fxor            %f10, %f6, %f6
-+
-+      prefetch        [$inp + 32+63], 20
-+      call            _${alg}${bits}_${dir}crypt_2x
-+      add             $inp, 32, $inp
-+
-+      movxtod         %g2, %f8
-+      movxtod         %g3, %f10
-+
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %g2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %g3
-+      xor             %l7, %g2, %g2
-+
-+      bshuffle        %f8,  %f8,  %f8
-+      bshuffle        %f10, %f10, %f10
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+      fxor            %f8,  %f4, %f4
-+      fxor            %f10, %f6, %f6
-+
-+      stda            %f0, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f2, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f4, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      add             $out, 8, $out
-+      stda            %f6, [$out]0xe2         ! ASI_BLK_INIT, T4-specific
-+      bgu,pt          $::size_t_cc, .L${bits}_xts_${dir}blk2x
-+      add             $out, 8, $out
-+
-+      add             $blk_init, $len, $len
-+      andcc           $len, 1, %g0            ! is number of blocks even?
-+      membar          #StoreLoad|#StoreStore
-+      bnz,pt          %icc, .L${bits}_xts_${dir}loop
-+      srl             $len, 0, $len
-+      brnz,pn         $len, .L${bits}_xts_${dir}loop2x
-+      nop
-+
-+      fsrc2           %f4, %f0
-+      fsrc2           %f6, %f2
-+      brnz,pn         $rem, .L${bits}_xts_${dir}steal
-+      nop
-+
-+      ret
-+      restore
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+___
-+$code.=<<___ if ($dir eq "en");
-+.align        32
-+.L${bits}_xts_${dir}steal:
-+      std             %f0, [%fp + $::bias-16] ! copy of output
-+      std             %f2, [%fp + $::bias-8]
-+
-+      srl             $ileft, 3, $ileft
-+      add             %fp, $::bias-16, %l7
-+      add             $inp, $ileft, $inp      ! original $inp+$len&-15
-+      add             $out, $ooff, $out       ! original $out+$len&-15
-+      mov             0, $ileft
-+      nop                                     ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+      ldub            [$inp + $ileft], %o0
-+      ldub            [%l7  + $ileft], %o1
-+      dec             $rem
-+      stb             %o0, [%l7  + $ileft]
-+      stb             %o1, [$out + $ileft]
-+      brnz            $rem, .L${bits}_xts_${dir}stealing
-+      inc             $ileft
-+
-+      mov             %l7, $inp
-+      sub             $out, 16, $out
-+      mov             0, $ileft
-+      sub             $out, $ooff, $out
-+      ba              .L${bits}_xts_${dir}loop        ! one more time
-+      mov             1, $len                         ! $rem is 0
-+___
-+$code.=<<___ if ($dir eq "de");
-+.align        32
-+.L${bits}_xts_${dir}steal:
-+      ldx             [$inp + 0], %o0
-+      brz,pt          $ileft, 8f
-+      ldx             [$inp + 8], %o1
-+
-+      ldx             [$inp + 16], %o2
-+      sllx            %o0, $ileft, %o0
-+      srlx            %o1, $iright, %g1
-+      sllx            %o1, $ileft, %o1
-+      or              %g1, %o0, %o0
-+      srlx            %o2, $iright, %o2
-+      or              %o2, %o1, %o1
-+8:
-+      srax            %g3, 63, %l7            ! next tweak value
-+      addcc           %g2, %g2, %o2
-+      and             %l7, 0x87, %l7
-+      addxc           %g3, %g3, %o3
-+      xor             %l7, %o2, %o2
-+
-+      movxtod         %o2, %f12
-+      movxtod         %o3, %f14
-+      bshuffle        %f12, %f12, %f12
-+      bshuffle        %f14, %f14, %f14
-+
-+      xor             %g4, %o0, %o0           ! ^= rk[0]
-+      xor             %g5, %o1, %o1
-+      movxtod         %o0, %f0
-+      movxtod         %o1, %f2
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+
-+      call            _${alg}${bits}_${dir}crypt_1x
-+      add             $inp, 16, $inp
-+
-+      fxor            %f12, %f0, %f0          ! ^= tweak[0]
-+      fxor            %f14, %f2, %f2
-+
-+      std             %f0, [%fp + $::bias-16]
-+      std             %f2, [%fp + $::bias-8]
-+
-+      srl             $ileft, 3, $ileft
-+      add             %fp, $::bias-16, %l7
-+      add             $inp, $ileft, $inp      ! original $inp+$len&-15
-+      add             $out, $ooff, $out       ! original $out+$len&-15
-+      mov             0, $ileft
-+      add             $out, 16, $out
-+      nop                                     ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+      ldub            [$inp + $ileft], %o0
-+      ldub            [%l7  + $ileft], %o1
-+      dec             $rem
-+      stb             %o0, [%l7  + $ileft]
-+      stb             %o1, [$out + $ileft]
-+      brnz            $rem, .L${bits}_xts_${dir}stealing
-+      inc             $ileft
-+
-+      mov             %l7, $inp
-+      sub             $out, 16, $out
-+      mov             0, $ileft
-+      sub             $out, $ooff, $out
-+      ba              .L${bits}_xts_${dir}loop        ! one more time
-+      mov             1, $len                         ! $rem is 0
-+___
-+$code.=<<___;
-+      ret
-+      restore
-+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
-+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
-+___
-+}
-+
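The recurring srax/addcc/and/addxc/xor sequence above multiplies the 128-bit XTS tweak by x in GF(2^128), reducing modulo x^128 + x^7 + x^2 + x + 1 (hence the 0x87 constant). A hedged Perl sketch of the computation, assuming a 64-bit Perl build; the helper name is hypothetical:

    # Double the tweak held as two 64-bit halves (%g2 = low, %g3 = high).
    sub xts_next_tweak {
        my ($lo, $hi) = @_;
        my $poly  = ($hi >> 63) ? 0x87 : 0;  # srax + and: conditional reduction
        my $carry = $lo >> 63;               # carry from addcc into addxc
        $lo = ($lo << 1) ^ $poly;            # addcc + xor (mod 2**64)
        $hi = ($hi << 1) | $carry;           # addxc shifts the carry in
        return ($lo, $hi);
    }

    my ($lo, $hi) = xts_next_tweak(0x8000000000000000, 0);
    printf "%016x %016x\n", $hi, $lo;        # 0000000000000001 0000000000000000
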
-+# The purpose of these subroutines is to explicitly encode VIS
-+# instructions, so that the module can be compiled without specifying
-+# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
-+# -xarch=v9a. The idea is to keep the option of producing a "universal"
-+# binary and to let the program detect at run-time whether the current
-+# CPU is VIS-capable.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %visopf = (        "faligndata"    => 0x048,
-+              "bshuffle"      => 0x04c,
-+              "fnot2"         => 0x066,
-+              "fxor"          => 0x06c,
-+              "fsrc2"         => 0x078        );
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+    if ($opf=$visopf{$mnemonic}) {
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%f([0-9]{1,2})/);
-+          $_=$1;
-+          if ($1>=32) {
-+              return $ref if ($1&1);
-+              # re-encode for upper double register addressing
-+              $_=($1|$1>>5)&31;
-+          }
-+      }
-+
-+      return  sprintf ".word\t0x%08x !%s",
-+                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+                      $ref;
-+    } else {
-+      return $ref;
-+    }
-+}
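As a quick sanity check of the encoder (assuming the unvis() helper above is in scope): fxor has opf 0x06c and %f12 maps to rs1=12, so by the 0x81b00000|rd<<25|rs1<<14|opf<<5|rs2 layout above the call below should emit 0x81b00000 | 12<<14 | 0x06c<<5 = 0x81b30d80. The register choices are illustrative only:

    print unvis("fxor", "%f12", "%f0", "%f0"), "\n";
    # expected: .word 0x81b30d80 !fxor %f12,%f0,%f0
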
-+
-+sub unvis3 {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my ($ref,$opf);
-+my %visopf = (        "addxc"         => 0x011,
-+              "addxccc"       => 0x013,
-+              "umulxhi"       => 0x016,
-+              "alignaddr"     => 0x018,
-+              "bmask"         => 0x019,
-+              "alignaddrl"    => 0x01a        );
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+    if ($opf=$visopf{$mnemonic}) {
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%([goli])([0-9])/);
-+          $_=$bias{$1}+$2;
-+      }
-+
-+      return  sprintf ".word\t0x%08x !%s",
-+                      0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+                      $ref;
-+    } else {
-+      return $ref;
-+    }
-+}
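unvis3() does the same for the integer-side VIS3 ops, mapping %g/%o/%l/%i names through the bias table; for the tweak-update instruction used above, addxc (opf 0x011) with %g3 in every position should, by the same formula, come out as 0x81b00000 | 3<<25 | 3<<14 | 0x011<<5 | 3 = 0x87b0c223 (computed from the encoder's own formula, shown for illustration):

    print unvis3("addxc", "%g3", "%g3", "%g3"), "\n";
    # expected: .word 0x87b0c223 !addxc %g3,%g3,%g3
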
-+
-+sub unaes_round {     # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = (        "aes_eround01"  => 0,
-+              "aes_eround23"  => 1,
-+              "aes_dround01"  => 2,
-+              "aes_dround23"  => 3,
-+              "aes_eround01_l"=> 4,
-+              "aes_eround23_l"=> 5,
-+              "aes_dround01_l"=> 6,
-+              "aes_dround23_l"=> 7,
-+              "aes_kexpand1"  => 8    );
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+    if (defined($opf=$aesopf{$mnemonic})) {
-+      $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%f([0-9]{1,2})/);
-+          $_=$1;
-+          if ($1>=32) {
-+              return $ref if ($1&1);
-+              # re-encode for upper double register addressing
-+              $_=($1|$1>>5)&31;
-+          }
-+      }
-+
-+      return  sprintf ".word\t0x%08x !%s",
-+                      2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
-+                      $ref;
-+    } else {
-+      return $ref;
-+    }
-+}
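For the four-operand AES forms, rs3 lands in bits 13:9 after the even-register remap; taking the first round instruction of the CTR loops as an illustration, aes_eround01 (opf 0) on %f16,%f14,%f2,%f8 should encode as 2<<30 | 8<<25 | 0x19<<19 | 16<<14 | 2<<9 | 14 = 0x90cc040e (derived from the formula above, not verified against hardware):

    print unaes_round("aes_eround01", "%f16", "%f14", "%f2", "%f8"), "\n";
    # expected: .word 0x90cc040e !aes_eround01 %f16,%f14,%f2,%f8
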
-+
-+sub unaes_kexpand {   # 3-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = (        "aes_kexpand0"  => 0x130,
-+              "aes_kexpand2"  => 0x131        );
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+    if (defined($opf=$aesopf{$mnemonic})) {
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%f([0-9]{1,2})/);
-+          $_=$1;
-+          if ($1>=32) {
-+              return $ref if ($1&1);
-+              # re-encode for upper double register addressing
-+              $_=($1|$1>>5)&31;
-+          }
-+      }
-+
-+      return  sprintf ".word\t0x%08x !%s",
-+                      2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
-+                      $ref;
-+    } else {
-+      return $ref;
-+    }
-+}
-+
-+sub uncamellia_f {    # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+
-+    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+    if (1) {
-+      $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+      foreach ($rs1,$rs2,$rd) {
-+          return $ref if (!/%f([0-9]{1,2})/);
-+          $_=$1;

@@ Diff output truncated at 100000 characters. @@