On Wed, Nov 11, 2020 at 02:17:41AM +0200, Maamoun TK wrote:
> I think I mislabeled the percentage of performance comparison, the new
> method achieved 27.7% reduction in time on POWER8 that corresponds to 37.9%
> increase in performance.

Hi Maamoun,

Many thanks to you and Niels.  We plan to test this on POWER9.

> 
> On Tue, Nov 10, 2020 at 6:25 AM Maamoun TK <maamoun...@googlemail.com>
> wrote:
> 
> > This implementation takes advantage of research made by Niels Möller to
> > optimize GCM on PowerPC, this optimization yields a +27.7% performance
> > boost on POWER8 over the previous implementation that was based on intel
> > documents. The performance comparison is made by processing 4 blocks per
> > loop without any further optimizations.
> > I added some documentation between the lines, but I suggest writing a
> > document similar to the Intel ones that goes into more detail and clarifies
> > why this method is preferable. I'm also curious whether this method can
> > make a difference on other architectures like ARM; I'm planning to try it
> > out for ARM to figure that out.
> > ---
> >  configure.ac              |   6 +-
> >  gcm.c                     |  49 +++--
> >  powerpc64/p8/gcm-hash.asm | 502
> > ++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 542 insertions(+), 15 deletions(-)
> >  create mode 100644 powerpc64/p8/gcm-hash.asm
> >
> > diff --git a/configure.ac b/configure.ac
> > index 2a47f940..20f7cf74 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm
> > aes-decrypt-internal.asm \
> >   sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
> >
> >  # Assembler files which generate additional object files if they are used.
> > -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
> > +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
> >    aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
> >    chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
> >    salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
> > @@ -621,9 +621,9 @@ AH_VERBATIM([HAVE_NATIVE],
> >  #undef HAVE_NATIVE_ecc_secp384r1_redc
> >  #undef HAVE_NATIVE_ecc_secp521r1_modp
> >  #undef HAVE_NATIVE_ecc_secp521r1_redc
> > -#undef HAVE_NATIVE_gcm_init_key8
> > +#undef HAVE_NATIVE_gcm_init_key
> > +#undef HAVE_NATIVE_gcm_hash
> >  #undef HAVE_NATIVE_gcm_hash8
> > -#undef HAVE_NATIVE_gcm_fill
> >  #undef HAVE_NATIVE_salsa20_core
> >  #undef HAVE_NATIVE_salsa20_2core
> >  #undef HAVE_NATIVE_fat_salsa20_2core
> > diff --git a/gcm.c b/gcm.c
> > index 48b3e75a..81981c1c 100644
> > --- a/gcm.c
> > +++ b/gcm.c
> > @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union
> > nettle_block16 *table)
> >    memcpy (x->b, Z.b, sizeof(Z));
> >  }
> >  # elif GCM_TABLE_BITS == 8
> > +#  if HAVE_NATIVE_gcm_init_key
> > +
> > +#define gcm_init_key _nettle_gcm_init_key
> > +void
> > +_nettle_gcm_init_key (union nettle_block16 *table);
> > +#  endif /* HAVE_NATIVE_gcm_init_key */
> > +#  if HAVE_NATIVE_gcm_hash
> > +
> > +#define gcm_hash _nettle_gcm_hash
> > +void
> > +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
> > +   size_t length, const uint8_t *data);
> > +#  endif /* HAVE_NATIVE_gcm_hash */
> >  #  if HAVE_NATIVE_gcm_hash8
> >
> >  #define gcm_hash _nettle_gcm_hash8
> > @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union
> > nettle_block16 *table)
> >  /* Increment the rightmost 32 bits. */
> >  #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
> >
> > +#ifndef gcm_init_key
> > +static void
> > +gcm_init_key(union nettle_block16 *table)
> > +{
> > +#if GCM_TABLE_BITS
> > +  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
> > +     element */
> > +  unsigned i = (1<<GCM_TABLE_BITS)/2;
> > +
> > +  /* Algorithm 3 from the gcm paper. First do powers of two, then do
> > +     the rest by adding. */
> > +  while (i /= 2)
> > +    block16_mulx_ghash(&table[i], &table[2*i]);
> > +  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
> > +    {
> > +      unsigned j;
> > +      for (j = 1; j < i; j++)
> > + block16_xor3(&table[i+j], &table[i], &table[j]);
> > +    }
> > +#endif
> > +}
> > +#endif /* !gcm_init_key */
> > +
> >  /* Initialization of GCM.
> >   * @ctx: The context of GCM
> >   * @cipher: The context of the underlying block cipher
> > @@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key,
> >    memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
> >    f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
> >
> > -#if GCM_TABLE_BITS
> > -  /* Algorithm 3 from the gcm paper. First do powers of two, then do
> > -     the rest by adding. */
> > -  while (i /= 2)
> > -    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
> > -  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
> > -    {
> > -      unsigned j;
> > -      for (j = 1; j < i; j++)
> > - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
> > -    }
> > -#endif
> > +  gcm_init_key(key->h);
> >  }
> >
> >  #ifndef gcm_hash
> > diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm
> > new file mode 100644
> > index 00000000..e79fbdc2
> > --- /dev/null
> > +++ b/powerpc64/p8/gcm-hash.asm
> > @@ -0,0 +1,502 @@
> > +C powerpc64/p8/gcm-hash.asm
> > +
> > +ifelse(`
> > +   Copyright (C) 2020 Niels Möller and Mamone Tarsha
> > +   This file is part of GNU Nettle.
> > +
> > +   GNU Nettle is free software: you can redistribute it and/or
> > +   modify it under the terms of either:
> > +
> > +     * the GNU Lesser General Public License as published by the Free
> > +       Software Foundation; either version 3 of the License, or (at your
> > +       option) any later version.
> > +
> > +   or
> > +
> > +     * the GNU General Public License as published by the Free
> > +       Software Foundation; either version 2 of the License, or (at your
> > +       option) any later version.
> > +
> > +   or both in parallel, as here.
> > +
> > +   GNU Nettle is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   General Public License for more details.
> > +
> > +   You should have received copies of the GNU General Public License and
> > +   the GNU Lesser General Public License along with this program.  If
> > +   not, see http://www.gnu.org/licenses/.
> > +')
> > +
> > +C Alignment of gcm_key table elements, which is declared in gcm.h
> > +define(`TableElemAlign', `0x100')
> > +
> > +C Register usage:
> > +
> > +define(`SP', `r1')
> > +define(`TOCP', `r2')
> > +
> > +define(`TABLE', `r3')
> > +
> > +define(`ZERO', `v0')
> > +define(`B1', `v1')
> > +define(`EMSB', `v16')
> > +define(`POLY', `v17')
> > +define(`POLY_L', `v1')
> > +
> > +define(`H', `v2')
> > +define(`H2', `v3')
> > +define(`H3', `v4')
> > +define(`H4', `v5')
> > +define(`H1M', `v6')
> > +define(`H1L', `v7')
> > +define(`H2M', `v8')
> > +define(`H2L', `v9')
> > +define(`Hl', `v10')
> > +define(`Hm', `v11')
> > +define(`Hp', `v12')
> > +define(`Hl2', `v13')
> > +define(`Hm2', `v14')
> > +define(`Hp2', `v15')
> > +define(`R', `v13')
> > +define(`F', `v14')
> > +define(`T', `v15')
> > +define(`R2', `v16')
> > +define(`F2', `v17')
> > +define(`T2', `v18')
> > +
> > +define(`LE_TEMP', `v18')
> > +define(`LE_MASK', `v19')
> > +
> > +.file "gcm-hash.asm"
> > +
> > +.text
> > +
> > +    C void gcm_init_key (union gcm_block *table)
> > +
> > +C This function populates the gcm table as the following layout
> > +C
> > *******************************************************************************
> > +C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
> >        |
> > +C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1
> > div x⁶⁴) |
> > +C |
> >       |
> > +C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
> >        |
> > +C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2
> > div x⁶⁴) |
> > +C |
> >       |
> > +C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
> >        |
> > +C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3
> > div x⁶⁴) |
> > +C |
> >       |
> > +C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
> >        |
> > +C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4
> > div x⁶⁴) |
> > +C
> > *******************************************************************************
> > +
> > +define(`FUNC_ALIGN', `5')
> > +PROLOGUE(_nettle_gcm_init_key)
> > +    DATA_LOAD_VEC(POLY,.polynomial,r7)           C
> > 0xC2000000000000000000000000000001
> > +IF_LE(`
> > +    li             r8,0
> > +    lvsl           LE_MASK,0,r8                  C
> > 0x000102030405060708090A0B0C0D0E0F
> > +    vspltisb       LE_TEMP,0x07                  C
> > 0x07070707070707070707070707070707
> > +    vxor           LE_MASK,LE_MASK,LE_TEMP       C
> > 0x07060504030201000F0E0D0C0B0A0908
> > +')
> > +
> > +    C 'H' is assigned by gcm_set_key() to the middle element of the table
> > +    li             r10,8*TableElemAlign
> > +    lxvd2x         VSR(H),r10,TABLE              C load 'H'
> > +    C byte-reverse of each doubleword permuting on little-endian mode
> > +IF_LE(`
> > +    vperm          H,H,H,LE_MASK
> > +')
> > +
> > +    C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1)
> > ---
> > +
> > +    vupkhsb        EMSB,H                        C extend most
> > significant bit to first byte
> > +    vspltisb       B1,1                          C
> > 0x01010101010101010101010101010101
> > +    vspltb         EMSB,EMSB,0                   C first byte
> > quadword-extend
> > +    vsl            H,H,B1                        C H = H << 1
> > +    vand           EMSB,EMSB,POLY                C EMSB &=
> > 0xC2000000000000000000000000000001
> > +    vxor           ZERO,ZERO,ZERO                C
> > 0x00000000000000000000000000000000
> > +    vxor           H,H,EMSB                      C H ^= EMSB
> > +
> > +    C --- calculate H^2 = H*H ---
> > +
> > +    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY) C
> > 0x0000000000000000C200000000000000
> > +
> > +    C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
> > +    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
> > +    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
> > +    vpmsumd        Hp,H,POLY_L                   C Hp = (H mod x⁶⁴) ×
> > (x⁶³+x⁶²+x⁵⁷)
> > +    xxmrgld        VSR(Hl),VSR(H),VSR(ZERO)      C Hl = (H mod x⁶⁴) × x⁶⁴
> > +    xxswapd        VSR(Hm),VSR(H)
> > +    vxor           Hl,Hl,Hp                      C Hl = Hl + Hp
> > +    vxor           Hm,Hm,Hp                      C Hm = Hm + Hp
> > +    xxmrghd        VSR(H1M),VSR(H),VSR(Hl)       C H1M = (H div x⁶⁴)||(Hl
> > div x⁶⁴)
> > +    xxmrgld        VSR(H1L),VSR(H),VSR(Hm)       C H1L = (H mod x⁶⁴)||(Hl
> > mod x⁶⁴)
> > +
> > +    vpmsumd        F,H1L,H                       C F = (H1Lh × Hh) +
> > (H1Ll × Hl)
> > +    vpmsumd        R,H1M,H                       C R = (H1Mh × Hh) +
> > (H1Ml × Hl)
> > +
> > +    C --- reduction ---
> > +    vpmsumd        T,F,POLY_L                    C T = (F mod x⁶⁴) ×
> > (x⁶³+x⁶²+x⁵⁷)
> > +    xxswapd        VSR(H2),VSR(F)
> > +    vxor           R,R,T                         C R = R + T
> > +    vxor           H2,R,H2
> > +
> > +    xxmrgld        VSR(Hl),VSR(H2),VSR(ZERO)
> > +    xxswapd        VSR(Hm),VSR(H2)
> > +    vpmsumd        Hp,H2,POLY_L
> > +    vxor           Hl,Hl,Hp
> > +    vxor           Hm,Hm,Hp
> > +    xxmrghd        VSR(H2M),VSR(H2),VSR(Hl)
> > +    xxmrgld        VSR(H2L),VSR(H2),VSR(Hm)
> > +
> > +    C store H1M, H1L, H2M, H2L
> > +    li             r8,1*TableElemAlign
> > +    li             r9,2*TableElemAlign
> > +    li             r10,3*TableElemAlign
> > +    stxvd2x        VSR(H1M),0,TABLE
> > +    stxvd2x        VSR(H1L),r8,TABLE
> > +    stxvd2x        VSR(H2M),r9,TABLE
> > +    stxvd2x        VSR(H2L),r10,TABLE
> > +
> > +    C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
> > +
> > +    vpmsumd        F,H1L,H2
> > +    vpmsumd        F2,H2L,H2
> > +    vpmsumd        R,H1M,H2
> > +    vpmsumd        R2,H2M,H2
> > +
> > +    vpmsumd        T,F,POLY_L
> > +    vpmsumd        T2,F2,POLY_L
> > +    xxswapd        VSR(H3),VSR(F)
> > +    xxswapd        VSR(H4),VSR(F2)
> > +    vxor           R,R,T
> > +    vxor           R2,R2,T2
> > +    vxor           H3,R,H3
> > +    vxor           H4,R2,H4
> > +
> > +    xxmrgld        VSR(Hl),VSR(H3),VSR(ZERO)
> > +    xxmrgld        VSR(Hl2),VSR(H4),VSR(ZERO)
> > +    xxswapd        VSR(Hm),VSR(H3)
> > +    xxswapd        VSR(Hm2),VSR(H4)
> > +    vpmsumd        Hp,H3,POLY_L
> > +    vpmsumd        Hp2,H4,POLY_L
> > +    vxor           Hl,Hl,Hp
> > +    vxor           Hl2,Hl2,Hp2
> > +    vxor           Hm,Hm,Hp
> > +    vxor           Hm2,Hm2,Hp2
> > +    xxmrghd        VSR(H1M),VSR(H3),VSR(Hl)
> > +    xxmrghd        VSR(H2M),VSR(H4),VSR(Hl2)
> > +    xxmrgld        VSR(H1L),VSR(H3),VSR(Hm)
> > +    xxmrgld        VSR(H2L),VSR(H4),VSR(Hm2)
> > +
> > +    C store H3M, H3L, H4M, H4L
> > +    li             r7,4*TableElemAlign
> > +    li             r8,5*TableElemAlign
> > +    li             r9,6*TableElemAlign
> > +    li             r10,7*TableElemAlign
> > +    stxvd2x        VSR(H1M),r7,TABLE
> > +    stxvd2x        VSR(H1L),r8,TABLE
> > +    stxvd2x        VSR(H2M),r9,TABLE
> > +    stxvd2x        VSR(H2L),r10,TABLE
> > +
> > +    blr
> > +EPILOGUE(_nettle_gcm_init_key)
> > +
> > +define(`TABLE', `r3')
> > +define(`X', `r4')
> > +define(`LENGTH', `r5')
> > +define(`DATA', `r6')
> > +
> > +define(`ZERO', `v16')
> > +define(`POLY', `v17')
> > +define(`POLY_L', `v0')
> > +
> > +define(`D', `v1')
> > +define(`C0', `v2')
> > +define(`C1', `v3')
> > +define(`C2', `v4')
> > +define(`C3', `v5')
> > +define(`H1M', `v6')
> > +define(`H1L', `v7')
> > +define(`H2M', `v8')
> > +define(`H2L', `v9')
> > +define(`H3M', `v10')
> > +define(`H3L', `v11')
> > +define(`H4M', `v12')
> > +define(`H4L', `v13')
> > +define(`R', `v14')
> > +define(`F', `v15')
> > +define(`R2', `v16')
> > +define(`F2', `v17')
> > +define(`R3', `v18')
> > +define(`F3', `v20')
> > +define(`R4', `v21')
> > +define(`F4', `v22')
> > +define(`T', `v23')
> > +
> > +define(`LE_TEMP', `v18')
> > +define(`LE_MASK', `v19')
> > +
> > +    C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
> > +    C                size_t length, const uint8_t *data)
> > +
> > +define(`FUNC_ALIGN', `5')
> > +PROLOGUE(_nettle_gcm_hash)
> > +    DATA_LOAD_VEC(POLY,.polynomial,r7)
> > +IF_LE(`
> > +    li             r8,0
> > +    lvsl           LE_MASK,0,r8
> > +    vspltisb       LE_TEMP,0x07
> > +    vxor           LE_MASK,LE_MASK,LE_TEMP
> > +')
> > +    vxor           ZERO,ZERO,ZERO
> > +    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY)
> > +
> > +    lxvd2x         VSR(D),0,X                    C load 'X' pointer
> > +    C byte-reverse of each doubleword permuting on little-endian mode
> > +IF_LE(`
> > +    vperm          D,D,D,LE_MASK
> > +')
> > +
> > +    C --- process 4 blocks '128-bit each' per one loop ---
> > +
> > +    srdi           r7,LENGTH,6                   C 4-blocks loop count
> > 'LENGTH / (4 * 16)'
> > +    cmpldi         r7,0
> > +    beq            L2x
> > +
> > +    mtctr          r7                            C assign counter
> > register to loop count
> > +
> > +    C store non-volatile vector registers
> > +    addi           r8,SP,-64
> > +    stvx           20,0,r8
> > +    addi           r8,r8,16
> > +    stvx           21,0,r8
> > +    addi           r8,r8,16
> > +    stvx           22,0,r8
> > +    addi           r8,r8,16
> > +    stvx           23,0,r8
> > +
> > +    C load table elements
> > +    li             r8,1*TableElemAlign
> > +    li             r9,2*TableElemAlign
> > +    li             r10,3*TableElemAlign
> > +    lxvd2x         VSR(H1M),0,TABLE
> > +    lxvd2x         VSR(H1L),r8,TABLE
> > +    lxvd2x         VSR(H2M),r9,TABLE
> > +    lxvd2x         VSR(H2L),r10,TABLE
> > +    li             r7,4*TableElemAlign
> > +    li             r8,5*TableElemAlign
> > +    li             r9,6*TableElemAlign
> > +    li             r10,7*TableElemAlign
> > +    lxvd2x         VSR(H3M),r7,TABLE
> > +    lxvd2x         VSR(H3L),r8,TABLE
> > +    lxvd2x         VSR(H4M),r9,TABLE
> > +    lxvd2x         VSR(H4L),r10,TABLE
> > +
> > +    li             r8,0x10
> > +    li             r9,0x20
> > +    li             r10,0x30
> > +.align 5
> > +L4x_loop:
> > +    C input loading
> > +    lxvd2x         VSR(C0),0,DATA                C load C0
> > +    lxvd2x         VSR(C1),r8,DATA               C load C1
> > +    lxvd2x         VSR(C2),r9,DATA               C load C2
> > +    lxvd2x         VSR(C3),r10,DATA              C load C3
> > +
> > +IF_LE(`
> > +    vperm          C0,C0,C0,LE_MASK
> > +    vperm          C1,C1,C1,LE_MASK
> > +    vperm          C2,C2,C2,LE_MASK
> > +    vperm          C3,C3,C3,LE_MASK
> > +')
> > +
> > +    C previous digest combining
> > +    vxor           C0,C0,D
> > +
> > +    C polynomial multiplication
> > +    vpmsumd        F2,H3L,C1
> > +    vpmsumd        R2,H3M,C1
> > +    vpmsumd        F3,H2L,C2
> > +    vpmsumd        R3,H2M,C2
> > +    vpmsumd        F4,H1L,C3
> > +    vpmsumd        R4,H1M,C3
> > +    vpmsumd        F,H4L,C0
> > +    vpmsumd        R,H4M,C0
> > +
> > +    C deferred recombination of partial products
> > +    vxor           F3,F3,F4
> > +    vxor           R3,R3,R4
> > +    vxor           F,F,F2
> > +    vxor           R,R,R2
> > +    vxor           F,F,F3
> > +    vxor           R,R,R3
> > +
> > +    C reduction
> > +    vpmsumd        T,F,POLY_L
> > +    xxswapd        VSR(D),VSR(F)
> > +    vxor           R,R,T
> > +    vxor           D,R,D
> > +
> > +    addi           DATA,DATA,0x40
> > +    bdnz           L4x_loop
> > +
> > +    C restore non-volatile vector registers
> > +    addi           r8,SP,-64
> > +    lvx            20,0,r8
> > +    addi           r8,r8,16
> > +    lvx            21,0,r8
> > +    addi           r8,r8,16
> > +    lvx            22,0,r8
> > +    addi           r8,r8,16
> > +    lvx            23,0,r8
> > +
> > +    clrldi         LENGTH,LENGTH,58              C 'set the high-order 58
> > bits to zeros'
> > +L2x:
> > +    C --- process 2 blocks ---
> > +
> > +    srdi           r7,LENGTH,5                   C 'LENGTH / (2 * 16)'
> > +    cmpldi         r7,0
> > +    beq            L1x
> > +
> > +    C load table elements
> > +    li             r8,1*TableElemAlign
> > +    li             r9,2*TableElemAlign
> > +    li             r10,3*TableElemAlign
> > +    lxvd2x         VSR(H1M),0,TABLE
> > +    lxvd2x         VSR(H1L),r8,TABLE
> > +    lxvd2x         VSR(H2M),r9,TABLE
> > +    lxvd2x         VSR(H2L),r10,TABLE
> > +
> > +    C input loading
> > +    li             r10,0x10
> > +    lxvd2x         VSR(C0),0,DATA                C load C0
> > +    lxvd2x         VSR(C1),r10,DATA              C load C1
> > +
> > +IF_LE(`
> > +    vperm          C0,C0,C0,LE_MASK
> > +    vperm          C1,C1,C1,LE_MASK
> > +')
> > +
> > +    C previous digest combining
> > +    vxor           C0,C0,D
> > +
> > +    C polynomial multiplication
> > +    vpmsumd        F2,H1L,C1
> > +    vpmsumd        R2,H1M,C1
> > +    vpmsumd        F,H2L,C0
> > +    vpmsumd        R,H2M,C0
> > +
> > +    C deferred recombination of partial products
> > +    vxor           F,F,F2
> > +    vxor           R,R,R2
> > +
> > +    C reduction
> > +    vpmsumd        T,F,POLY_L
> > +    xxswapd        VSR(D),VSR(F)
> > +    vxor           R,R,T
> > +    vxor           D,R,D
> > +
> > +    addi           DATA,DATA,0x20
> > +    clrldi         LENGTH,LENGTH,59              C 'set the high-order 59
> > bits to zeros'
> > +L1x:
> > +    C --- process 1 block ---
> > +
> > +    srdi           r7,LENGTH,4                   C 'LENGTH / (1 * 16)'
> > +    cmpldi         r7,0
> > +    beq            Lmod
> > +
> > +    C load table elements
> > +    li             r8,1*TableElemAlign
> > +    lxvd2x         VSR(H1M),0,TABLE
> > +    lxvd2x         VSR(H1L),r8,TABLE
> > +
> > +    C input loading
> > +    lxvd2x         VSR(C0),0,DATA                C load C0
> > +
> > +IF_LE(`
> > +    vperm          C0,C0,C0,LE_MASK
> > +')
> > +
> > +    C previous digest combining
> > +    vxor           C0,C0,D
> > +
> > +    C polynomial multiplication
> > +    vpmsumd        F,H1L,C0
> > +    vpmsumd        R,H1M,C0
> > +
> > +    C reduction
> > +    vpmsumd        T,F,POLY_L
> > +    xxswapd        VSR(D),VSR(F)
> > +    vxor           R,R,T
> > +    vxor           D,R,D
> > +
> > +    addi           DATA,DATA,0x10
> > +    clrldi         LENGTH,LENGTH,60              C 'set the high-order 60
> > bits to zeros'
> > +Lmod:
> > +    C --- process the modulo bytes, padding the low-order bytes with
> > zeros ---
> > +
> > +    cmpldi         LENGTH,0
> > +    beq            Ldone
> > +
> > +    C load table elements
> > +    li             r8,1*TableElemAlign
> > +    lxvd2x         VSR(H1M),0,TABLE
> > +    lxvd2x         VSR(H1L),r8,TABLE
> > +
> > +    C push every modulo byte to the stack and load them with padding into
> > vector register
> > +    vxor           ZERO,ZERO,ZERO
> > +    addi           r8,SP,-16
> > +    stvx           ZERO,0,r8
> > +Lstb_loop:
> > +    subic.         LENGTH,LENGTH,1
> > +    lbzx           r7,LENGTH,DATA
> > +    stbx           r7,LENGTH,r8
> > +    bne            Lstb_loop
> > +    lxvd2x         VSR(C0),0,r8
> > +
> > +IF_LE(`
> > +    vperm          C0,C0,C0,LE_MASK
> > +')
> > +
> > +    C previous digest combining
> > +    vxor           C0,C0,D
> > +
> > +    C polynomial multiplication
> > +    vpmsumd        F,H1L,C0
> > +    vpmsumd        R,H1M,C0
> > +
> > +    C reduction
> > +    vpmsumd        T,F,POLY_L
> > +    xxswapd        VSR(D),VSR(F)
> > +    vxor           R,R,T
> > +    vxor           D,R,D
> > +
> > +Ldone:
> > +    C byte-reverse of each doubleword permuting on little-endian mode
> > +IF_LE(`
> > +    vperm          D,D,D,LE_MASK
> > +')
> > +    stxvd2x        VSR(D),0,X                    C store digest 'D'
> > +
> > +    blr
> > +EPILOGUE(_nettle_gcm_hash)
> > +
> > +.data
> > +    C 0xC2000000000000000000000000000001
> > +.polynomial:
> > +.align 4
> > +IF_BE(`
> > +.byte 0xC2
> > +.rept 14
> > +.byte 0x00
> > +.endr
> > +.byte 0x01
> > +',`
> > +.byte 0x01
> > +.rept 14
> > +.byte 0x00
> > +.endr
> > +.byte 0xC2
> > +')
> >
> > --
> > 2.17.1
> >
> _______________________________________________
> nettle-bugs mailing list
> nettle-bugs@lists.lysator.liu.se
> http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

-- 
George Wilson
IBM Linux Technology Center
Security Development
_______________________________________________
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to