On Wed, Nov 11, 2020 at 02:17:41AM +0200, Maamoun TK wrote: > I think I mislabeled the percentage in the performance comparison: the new > method achieved a 27.7% reduction in time on POWER8, which corresponds to a 37.9% > increase in performance.
Hi Maamoun, Many thanks to you and Niels. We plan to test this on POWER9. > > On Tue, Nov 10, 2020 at 6:25 AM Maamoun TK <maamoun...@googlemail.com> > wrote: > > > This implementation takes advantage of research made by Niels Möller to > > optimize GCM on PowerPC, this optimization yields a +27.7% performance > > boost on POWER8 over the previous implementation that was based on intel > > documents. The performance comparison is made by processing 4 blocks per > > loop without any further optimizations. > > I made some documentations between the lines but I suggest writing a > > document similar to the intel ones that go into more details and clarify > > the preference of this method. I'm also curious if this method can also > > make a difference in other architectures like ARM, I'm planning to try it > > out for ARM to figure that out. > > --- > > configure.ac | 6 +- > > gcm.c | 49 +++-- > > powerpc64/p8/gcm-hash.asm | 502 > > ++++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 542 insertions(+), 15 deletions(-) > > create mode 100644 powerpc64/p8/gcm-hash.asm > > > > diff --git a/configure.ac b/configure.ac > > index 2a47f940..20f7cf74 100644 > > --- a/configure.ac > > +++ b/configure.ac > > @@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm > > aes-decrypt-internal.asm \ > > sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4" > > > > # Assembler files which generate additional object files if they are used. 
> > -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ > > +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ > > aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ > > chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ > > salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ > > @@ -621,9 +621,9 @@ AH_VERBATIM([HAVE_NATIVE], > > #undef HAVE_NATIVE_ecc_secp384r1_redc > > #undef HAVE_NATIVE_ecc_secp521r1_modp > > #undef HAVE_NATIVE_ecc_secp521r1_redc > > -#undef HAVE_NATIVE_gcm_init_key8 > > +#undef HAVE_NATIVE_gcm_init_key > > +#undef HAVE_NATIVE_gcm_hash > > #undef HAVE_NATIVE_gcm_hash8 > > -#undef HAVE_NATIVE_gcm_fill > > #undef HAVE_NATIVE_salsa20_core > > #undef HAVE_NATIVE_salsa20_2core > > #undef HAVE_NATIVE_fat_salsa20_2core > > diff --git a/gcm.c b/gcm.c > > index 48b3e75a..81981c1c 100644 > > --- a/gcm.c > > +++ b/gcm.c > > @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union > > nettle_block16 *table) > > memcpy (x->b, Z.b, sizeof(Z)); > > } > > # elif GCM_TABLE_BITS == 8 > > +# if HAVE_NATIVE_gcm_init_key > > + > > +#define gcm_init_key _nettle_gcm_init_key > > +void > > +_nettle_gcm_init_key (union nettle_block16 *table); > > +# endif /* HAVE_NATIVE_gcm_init_key */ > > +# if HAVE_NATIVE_gcm_hash > > + > > +#define gcm_hash _nettle_gcm_hash > > +void > > +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, > > + size_t length, const uint8_t *data); > > +# endif /* HAVE_NATIVE_gcm_hash */ > > # if HAVE_NATIVE_gcm_hash8 > > > > #define gcm_hash _nettle_gcm_hash8 > > @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union > > nettle_block16 *table) > > /* Increment the rightmost 32 bits. 
*/ > > #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4) > > > > +#ifndef gcm_init_key > > +static void > > +gcm_init_key(union nettle_block16 *table) > > +{ > > +#if GCM_TABLE_BITS > > + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first > > + element */ > > + unsigned i = (1<<GCM_TABLE_BITS)/2; > > + > > + /* Algorithm 3 from the gcm paper. First do powers of two, then do > > + the rest by adding. */ > > + while (i /= 2) > > + block16_mulx_ghash(&table[i], &table[2*i]); > > + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) > > + { > > + unsigned j; > > + for (j = 1; j < i; j++) > > + block16_xor3(&table[i+j], &table[i], &table[j]); > > + } > > +#endif > > +} > > +#endif /* !gcm_init_key */ > > + > > /* Initialization of GCM. > > * @ctx: The context of GCM > > * @cipher: The context of the underlying block cipher > > @@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key, > > memset(key->h[0].b, 0, GCM_BLOCK_SIZE); > > f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b); > > > > -#if GCM_TABLE_BITS > > - /* Algorithm 3 from the gcm paper. First do powers of two, then do > > - the rest by adding. */ > > - while (i /= 2) > > - block16_mulx_ghash(&key->h[i], &key->h[2*i]); > > - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) > > - { > > - unsigned j; > > - for (j = 1; j < i; j++) > > - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); > > - } > > -#endif > > + gcm_init_key(key->h); > > } > > > > #ifndef gcm_hash > > diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm > > new file mode 100644 > > index 00000000..e79fbdc2 > > --- /dev/null > > +++ b/powerpc64/p8/gcm-hash.asm > > @@ -0,0 +1,502 @@ > > +C powerpc64/p8/gcm-hash.asm > > + > > +ifelse(` > > + Copyright (C) 2020 Niels Möller and Mamone Tarsha > > + This file is part of GNU Nettle. 
> > + > > + GNU Nettle is free software: you can redistribute it and/or > > + modify it under the terms of either: > > + > > + * the GNU Lesser General Public License as published by the Free > > + Software Foundation; either version 3 of the License, or (at your > > + option) any later version. > > + > > + or > > + > > + * the GNU General Public License as published by the Free > > + Software Foundation; either version 2 of the License, or (at your > > + option) any later version. > > + > > + or both in parallel, as here. > > + > > + GNU Nettle is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + General Public License for more details. > > + > > + You should have received copies of the GNU General Public License and > > + the GNU Lesser General Public License along with this program. If > > + not, see http://www.gnu.org/licenses/. > > +') > > + > > +C Alignment of gcm_key table elements, which is declared in gcm.h > > +define(`TableElemAlign', `0x100') > > + > > +C Register usage: > > + > > +define(`SP', `r1') > > +define(`TOCP', `r2') > > + > > +define(`TABLE', `r3') > > + > > +define(`ZERO', `v0') > > +define(`B1', `v1') > > +define(`EMSB', `v16') > > +define(`POLY', `v17') > > +define(`POLY_L', `v1') > > + > > +define(`H', `v2') > > +define(`H2', `v3') > > +define(`H3', `v4') > > +define(`H4', `v5') > > +define(`H1M', `v6') > > +define(`H1L', `v7') > > +define(`H2M', `v8') > > +define(`H2L', `v9') > > +define(`Hl', `v10') > > +define(`Hm', `v11') > > +define(`Hp', `v12') > > +define(`Hl2', `v13') > > +define(`Hm2', `v14') > > +define(`Hp2', `v15') > > +define(`R', `v13') > > +define(`F', `v14') > > +define(`T', `v15') > > +define(`R2', `v16') > > +define(`F2', `v17') > > +define(`T2', `v18') > > + > > +define(`LE_TEMP', `v18') > > +define(`LE_MASK', `v19') > > + > > +.file "gcm-hash.asm" > > + > > +.text > > + > > + C 
void gcm_init_key (union gcm_block *table) > > + > > +C This function populates the gcm table with the following layout > > +C > > ******************************************************************************* > > +C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ > > | > > +C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 > > div x⁶⁴) | > > +C | > > | > > +C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ > > | > > +C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 > > div x⁶⁴) | > > +C | > > | > > +C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ > > | > > +C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 > > div x⁶⁴) | > > +C | > > | > > +C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ > > | > > +C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 > > div x⁶⁴) | > > +C > > ******************************************************************************* > > + > > +define(`FUNC_ALIGN', `5') > > +PROLOGUE(_nettle_gcm_init_key) > > + DATA_LOAD_VEC(POLY,.polynomial,r7) C > > 0xC2000000000000000000000000000001 > > +IF_LE(` > > + li r8,0 > > + lvsl LE_MASK,0,r8 C > > 0x000102030405060708090A0B0C0D0E0F > > + vspltisb LE_TEMP,0x07 C > > 0x07070707070707070707070707070707 > > + vxor LE_MASK,LE_MASK,LE_TEMP C > > 0x07060504030201000F0E0D0C0B0A0908 > > +') > > + > > + C 'H' is assigned by gcm_set_key() to the middle element of the table > > + li r10,8*TableElemAlign > > + lxvd2x VSR(H),r10,TABLE C load 'H' > > + C byte-reverse of each doubleword permuting on little-endian mode > > +IF_LE(` > > + vperm H,H,H,LE_MASK > > +') > > + > > + C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) > > --- > > + > > + vupkhsb EMSB,H C extend most > > significant bit to first byte > > + vspltisb B1,1 C > > 0x01010101010101010101010101010101 > > + vspltb EMSB,EMSB,0 C first byte > > quadword-extend > > + vsl H,H,B1 C H = H 
<< 1 > > + vand EMSB,EMSB,POLY C EMSB &= > > 0xC2000000000000000000000000000001 > > + vxor ZERO,ZERO,ZERO C > > 0x00000000000000000000000000000000 > > + vxor H,H,EMSB C H ^= EMSB > > + > > + C --- calculate H^2 = H*H --- > > + > > + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C > > 0x0000000000000000C200000000000000 > > + > > + C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) --- > > + C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 --- > > + C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) --- > > + vpmsumd Hp,H,POLY_L C Hp = (H mod x⁶⁴) × > > (x⁶³+x⁶²+x⁵⁷) > > + xxmrgld VSR(Hl),VSR(H),VSR(ZERO) C Hl = (H mod x⁶⁴) × x⁶⁴ > > + xxswapd VSR(Hm),VSR(H) > > + vxor Hl,Hl,Hp C Hl = Hl + Hp > > + vxor Hm,Hm,Hp C Hm = Hm + Hp > > + xxmrghd VSR(H1M),VSR(H),VSR(Hl) C H1M = (H div x⁶⁴)||(Hl > > div x⁶⁴) > > + xxmrgld VSR(H1L),VSR(H),VSR(Hm) C H1L = (H mod x⁶⁴)||(Hm > > mod x⁶⁴) > > + > > + vpmsumd F,H1L,H C F = (H1Lh × Hh) + > > (H1Ll × Hl) > > + vpmsumd R,H1M,H C R = (H1Mh × Hh) + > > (H1Ml × Hl) > > + > > + C --- reduction --- > > + vpmsumd T,F,POLY_L C T = (F mod x⁶⁴) × > > (x⁶³+x⁶²+x⁵⁷) > > + xxswapd VSR(H2),VSR(F) > > + vxor R,R,T C R = R + T > > + vxor H2,R,H2 > > + > > + xxmrgld VSR(Hl),VSR(H2),VSR(ZERO) > > + xxswapd VSR(Hm),VSR(H2) > > + vpmsumd Hp,H2,POLY_L > > + vxor Hl,Hl,Hp > > + vxor Hm,Hm,Hp > > + xxmrghd VSR(H2M),VSR(H2),VSR(Hl) > > + xxmrgld VSR(H2L),VSR(H2),VSR(Hm) > > + > > + C store H1M, H1L, H2M, H2L > > + li r8,1*TableElemAlign > > + li r9,2*TableElemAlign > > + li r10,3*TableElemAlign > > + stxvd2x VSR(H1M),0,TABLE > > + stxvd2x VSR(H1L),r8,TABLE > > + stxvd2x VSR(H2M),r9,TABLE > > + stxvd2x VSR(H2L),r10,TABLE > > + > > + C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 --- > > + > > + vpmsumd F,H1L,H2 > > + vpmsumd F2,H2L,H2 > > + vpmsumd R,H1M,H2 > > + vpmsumd R2,H2M,H2 > > + > > + vpmsumd T,F,POLY_L > > + vpmsumd T2,F2,POLY_L > > + xxswapd VSR(H3),VSR(F) > > + xxswapd VSR(H4),VSR(F2) > > + vxor R,R,T > > + vxor R2,R2,T2 > > + vxor H3,R,H3 > > + vxor 
H4,R2,H4 > > + > > + xxmrgld VSR(Hl),VSR(H3),VSR(ZERO) > > + xxmrgld VSR(Hl2),VSR(H4),VSR(ZERO) > > + xxswapd VSR(Hm),VSR(H3) > > + xxswapd VSR(Hm2),VSR(H4) > > + vpmsumd Hp,H3,POLY_L > > + vpmsumd Hp2,H4,POLY_L > > + vxor Hl,Hl,Hp > > + vxor Hl2,Hl2,Hp2 > > + vxor Hm,Hm,Hp > > + vxor Hm2,Hm2,Hp2 > > + xxmrghd VSR(H1M),VSR(H3),VSR(Hl) > > + xxmrghd VSR(H2M),VSR(H4),VSR(Hl2) > > + xxmrgld VSR(H1L),VSR(H3),VSR(Hm) > > + xxmrgld VSR(H2L),VSR(H4),VSR(Hm2) > > + > > + C store H3M, H3L, H4M, H4L > > + li r7,4*TableElemAlign > > + li r8,5*TableElemAlign > > + li r9,6*TableElemAlign > > + li r10,7*TableElemAlign > > + stxvd2x VSR(H1M),r7,TABLE > > + stxvd2x VSR(H1L),r8,TABLE > > + stxvd2x VSR(H2M),r9,TABLE > > + stxvd2x VSR(H2L),r10,TABLE > > + > > + blr > > +EPILOGUE(_nettle_gcm_init_key) > > + > > +define(`TABLE', `r3') > > +define(`X', `r4') > > +define(`LENGTH', `r5') > > +define(`DATA', `r6') > > + > > +define(`ZERO', `v16') > > +define(`POLY', `v17') > > +define(`POLY_L', `v0') > > + > > +define(`D', `v1') > > +define(`C0', `v2') > > +define(`C1', `v3') > > +define(`C2', `v4') > > +define(`C3', `v5') > > +define(`H1M', `v6') > > +define(`H1L', `v7') > > +define(`H2M', `v8') > > +define(`H2L', `v9') > > +define(`H3M', `v10') > > +define(`H3L', `v11') > > +define(`H4M', `v12') > > +define(`H4L', `v13') > > +define(`R', `v14') > > +define(`F', `v15') > > +define(`R2', `v16') > > +define(`F2', `v17') > > +define(`R3', `v18') > > +define(`F3', `v20') > > +define(`R4', `v21') > > +define(`F4', `v22') > > +define(`T', `v23') > > + > > +define(`LE_TEMP', `v18') > > +define(`LE_MASK', `v19') > > + > > + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, > > + C size_t length, const uint8_t *data) > > + > > +define(`FUNC_ALIGN', `5') > > +PROLOGUE(_nettle_gcm_hash) > > + DATA_LOAD_VEC(POLY,.polynomial,r7) > > +IF_LE(` > > + li r8,0 > > + lvsl LE_MASK,0,r8 > > + vspltisb LE_TEMP,0x07 > > + vxor LE_MASK,LE_MASK,LE_TEMP > > +') > > + vxor ZERO,ZERO,ZERO > > + xxmrghd 
VSR(POLY_L),VSR(ZERO),VSR(POLY) > > + > > + lxvd2x VSR(D),0,X C load 'X' pointer > > + C byte-reverse of each doubleword permuting on little-endian mode > > +IF_LE(` > > + vperm D,D,D,LE_MASK > > +') > > + > > + C --- process 4 blocks '128-bit each' per one loop --- > > + > > + srdi r7,LENGTH,6 C 4-blocks loop count > > 'LENGTH / (4 * 16)' > > + cmpldi r7,0 > > + beq L2x > > + > > + mtctr r7 C assign counter > > register to loop count > > + > > + C store non-volatile vector registers > > + addi r8,SP,-64 > > + stvx 20,0,r8 > > + addi r8,r8,16 > > + stvx 21,0,r8 > > + addi r8,r8,16 > > + stvx 22,0,r8 > > + addi r8,r8,16 > > + stvx 23,0,r8 > > + > > + C load table elements > > + li r8,1*TableElemAlign > > + li r9,2*TableElemAlign > > + li r10,3*TableElemAlign > > + lxvd2x VSR(H1M),0,TABLE > > + lxvd2x VSR(H1L),r8,TABLE > > + lxvd2x VSR(H2M),r9,TABLE > > + lxvd2x VSR(H2L),r10,TABLE > > + li r7,4*TableElemAlign > > + li r8,5*TableElemAlign > > + li r9,6*TableElemAlign > > + li r10,7*TableElemAlign > > + lxvd2x VSR(H3M),r7,TABLE > > + lxvd2x VSR(H3L),r8,TABLE > > + lxvd2x VSR(H4M),r9,TABLE > > + lxvd2x VSR(H4L),r10,TABLE > > + > > + li r8,0x10 > > + li r9,0x20 > > + li r10,0x30 > > +.align 5 > > +L4x_loop: > > + C input loading > > + lxvd2x VSR(C0),0,DATA C load C0 > > + lxvd2x VSR(C1),r8,DATA C load C1 > > + lxvd2x VSR(C2),r9,DATA C load C2 > > + lxvd2x VSR(C3),r10,DATA C load C3 > > + > > +IF_LE(` > > + vperm C0,C0,C0,LE_MASK > > + vperm C1,C1,C1,LE_MASK > > + vperm C2,C2,C2,LE_MASK > > + vperm C3,C3,C3,LE_MASK > > +') > > + > > + C previous digest combining > > + vxor C0,C0,D > > + > > + C polynomial multiplication > > + vpmsumd F2,H3L,C1 > > + vpmsumd R2,H3M,C1 > > + vpmsumd F3,H2L,C2 > > + vpmsumd R3,H2M,C2 > > + vpmsumd F4,H1L,C3 > > + vpmsumd R4,H1M,C3 > > + vpmsumd F,H4L,C0 > > + vpmsumd R,H4M,C0 > > + > > + C deferred recombination of partial products > > + vxor F3,F3,F4 > > + vxor R3,R3,R4 > > + vxor F,F,F2 > > + vxor R,R,R2 > > + vxor F,F,F3 > > + vxor R,R,R3 
> > + > > + C reduction > > + vpmsumd T,F,POLY_L > > + xxswapd VSR(D),VSR(F) > > + vxor R,R,T > > + vxor D,R,D > > + > > + addi DATA,DATA,0x40 > > + bdnz L4x_loop > > + > > + C restore non-volatile vector registers > > + addi r8,SP,-64 > > + lvx 20,0,r8 > > + addi r8,r8,16 > > + lvx 21,0,r8 > > + addi r8,r8,16 > > + lvx 22,0,r8 > > + addi r8,r8,16 > > + lvx 23,0,r8 > > + > > + clrldi LENGTH,LENGTH,58 C 'set the high-order 58 > > bits to zeros' > > +L2x: > > + C --- process 2 blocks --- > > + > > + srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)' > > + cmpldi r7,0 > > + beq L1x > > + > > + C load table elements > > + li r8,1*TableElemAlign > > + li r9,2*TableElemAlign > > + li r10,3*TableElemAlign > > + lxvd2x VSR(H1M),0,TABLE > > + lxvd2x VSR(H1L),r8,TABLE > > + lxvd2x VSR(H2M),r9,TABLE > > + lxvd2x VSR(H2L),r10,TABLE > > + > > + C input loading > > + li r10,0x10 > > + lxvd2x VSR(C0),0,DATA C load C0 > > + lxvd2x VSR(C1),r10,DATA C load C1 > > + > > +IF_LE(` > > + vperm C0,C0,C0,LE_MASK > > + vperm C1,C1,C1,LE_MASK > > +') > > + > > + C previous digest combining > > + vxor C0,C0,D > > + > > + C polynomial multiplication > > + vpmsumd F2,H1L,C1 > > + vpmsumd R2,H1M,C1 > > + vpmsumd F,H2L,C0 > > + vpmsumd R,H2M,C0 > > + > > + C deferred recombination of partial products > > + vxor F,F,F2 > > + vxor R,R,R2 > > + > > + C reduction > > + vpmsumd T,F,POLY_L > > + xxswapd VSR(D),VSR(F) > > + vxor R,R,T > > + vxor D,R,D > > + > > + addi DATA,DATA,0x20 > > + clrldi LENGTH,LENGTH,59 C 'set the high-order 59 > > bits to zeros' > > +L1x: > > + C --- process 1 block --- > > + > > + srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)' > > + cmpldi r7,0 > > + beq Lmod > > + > > + C load table elements > > + li r8,1*TableElemAlign > > + lxvd2x VSR(H1M),0,TABLE > > + lxvd2x VSR(H1L),r8,TABLE > > + > > + C input loading > > + lxvd2x VSR(C0),0,DATA C load C0 > > + > > +IF_LE(` > > + vperm C0,C0,C0,LE_MASK > > +') > > + > > + C previous digest combining > > + vxor C0,C0,D > > + > > + C polynomial 
multiplication > > + vpmsumd F,H1L,C0 > > + vpmsumd R,H1M,C0 > > + > > + C reduction > > + vpmsumd T,F,POLY_L > > + xxswapd VSR(D),VSR(F) > > + vxor R,R,T > > + vxor D,R,D > > + > > + addi DATA,DATA,0x10 > > + clrldi LENGTH,LENGTH,60 C 'set the high-order 60 > > bits to zeros' > > +Lmod: > > + C --- process the modulo bytes, padding the low-order bytes with > > zeros --- > > + > > + cmpldi LENGTH,0 > > + beq Ldone > > + > > + C load table elements > > + li r8,1*TableElemAlign > > + lxvd2x VSR(H1M),0,TABLE > > + lxvd2x VSR(H1L),r8,TABLE > > + > > + C push every modulo byte to the stack and load them with padding into > > vector register > > + vxor ZERO,ZERO,ZERO > > + addi r8,SP,-16 > > + stvx ZERO,0,r8 > > +Lstb_loop: > > + subic. LENGTH,LENGTH,1 > > + lbzx r7,LENGTH,DATA > > + stbx r7,LENGTH,r8 > > + bne Lstb_loop > > + lxvd2x VSR(C0),0,r8 > > + > > +IF_LE(` > > + vperm C0,C0,C0,LE_MASK > > +') > > + > > + C previous digest combining > > + vxor C0,C0,D > > + > > + C polynomial multiplication > > + vpmsumd F,H1L,C0 > > + vpmsumd R,H1M,C0 > > + > > + C reduction > > + vpmsumd T,F,POLY_L > > + xxswapd VSR(D),VSR(F) > > + vxor R,R,T > > + vxor D,R,D > > + > > +Ldone: > > + C byte-reverse of each doubleword permuting on little-endian mode > > +IF_LE(` > > + vperm D,D,D,LE_MASK > > +') > > + stxvd2x VSR(D),0,X C store digest 'D' > > + > > + blr > > +EPILOGUE(_nettle_gcm_hash) > > + > > +.data > > + C 0xC2000000000000000000000000000001 > > +.polynomial: > > +.align 4 > > +IF_BE(` > > +.byte 0xC2 > > +.rept 14 > > +.byte 0x00 > > +.endr > > +.byte 0x01 > > +',` > > +.byte 0x01 > > +.rept 14 > > +.byte 0x00 > > +.endr > > +.byte 0xC2 > > +') > > > > -- > > 2.17.1 > > > _______________________________________________ > nettle-bugs mailing list > nettle-bugs@lists.lysator.liu.se > http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs -- George Wilson IBM Linux Technology Center Security Development _______________________________________________ nettle-bugs mailing list 
nettle-bugs@lists.lysator.liu.se http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs