This implementation is based on the existing per-algorithm optimized
powerpc64/p8/aes-encrypt-internal.asm and powerpc64/p8/gcm-hash.asm
implementations by Niels Möller and Mamone Tarsha. See the previous
gcm_aes_encrypt() commit for details about the major changes.

Signed-off-by: Christopher M. Riedl <c...@linux.ibm.com>
---
 powerpc64/p9/gcm-aes-decrypt.asm | 663 +++++++++++++++++++++++++++++++
 1 file changed, 663 insertions(+)
 create mode 100644 powerpc64/p9/gcm-aes-decrypt.asm

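For reference, the hot loops implement the standard GCM-CTR decrypt flow:
plaintext = AES(counter) XOR ciphertext, followed by an aggregated GHASH
update of the digest with the ciphertext blocks. The scalar sketch below
shows roughly what one iteration of the 4-block (L4x) path computes;
gf128_mul() follows the bitwise multiplication from NIST SP 800-38D, and
gf128_mul/inc32/ghash_update4 are illustrative names only, not Nettle API.

#include <stdint.h>
#include <string.h>

/* GF(2^128) multiply in the GCM bit order (NIST SP 800-38D). */
static void
gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
  uint8_t v[16], acc[16] = {0};
  int i, j, lsb;

  memcpy(v, y, 16);
  for (i = 0; i < 128; i++)
    {
      if (x[i / 8] & (0x80 >> (i % 8)))          /* bit i of x, MSB first */
        for (j = 0; j < 16; j++) acc[j] ^= v[j];
      lsb = v[15] & 1;
      for (j = 15; j > 0; j--)                   /* v >>= 1 */
        v[j] = (uint8_t)((v[j] >> 1) | (v[j-1] << 7));
      v[0] >>= 1;
      if (lsb) v[0] ^= 0xE1;                     /* reduce by the GCM poly */
    }
  memcpy(z, acc, 16);
}

/* GCM inc32: step the last 4 bytes of the counter block mod 2^32,
   leaving the 12-byte IV untouched. */
static void
inc32(uint8_t ctr[16])
{
  int i;
  for (i = 15; i >= 12; i--)
    if (++ctr[i] != 0) break;
}

/* One aggregated 4-block GHASH update:
   d = (d ^ c0)*H^4 ^ c1*H^3 ^ c2*H^2 ^ c3*H,
   which equals four chained single-block updates (Horner's rule). */
static void
ghash_update4(uint8_t d[16], const uint8_t h[16], const uint8_t c[4][16])
{
  uint8_t hp[4][16], t[16], sum[16];
  int i, j;

  memcpy(hp[0], h, 16);                          /* H^1 */
  for (i = 1; i < 4; i++)
    gf128_mul(hp[i], hp[i-1], h);                /* H^2, H^3, H^4 */

  for (j = 0; j < 16; j++) sum[j] = d[j] ^ c[0][j];
  gf128_mul(sum, sum, hp[3]);                    /* (d ^ c0) * H^4 */
  for (i = 1; i < 4; i++)
    {
      gf128_mul(t, c[i], hp[3-i]);               /* c1*H^3, c2*H^2, c3*H */
      for (j = 0; j < 16; j++) sum[j] ^= t[j];
    }
  memcpy(d, sum, 16);
}

The 8-block path performs the same update twice per iteration (S0..S3,
then S4..S7); the 2x/1x/partial paths have the same shape using H^2 and
H^1 respectively.
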
diff --git a/powerpc64/p9/gcm-aes-decrypt.asm b/powerpc64/p9/gcm-aes-decrypt.asm
new file mode 100644
index 00000000..4316a487
--- /dev/null
+++ b/powerpc64/p9/gcm-aes-decrypt.asm
@@ -0,0 +1,663 @@
+C powerpc64/p9/gcm-aes-decrypt.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
+   Copyright (C) 2021 Christopher M. Riedl
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+
+.file "gcm-aes-decrypt.asm"
+
+.text
+
+C void gcm_aes_decrypt(const struct gcm_key *key, union gcm_block *x,
+C                      size_t length, const uint8_t *src,
+C                      unsigned rounds, const uint32_t *keys,
+C                      uint8_t *dst, uint32_t *ctr)
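+C
+C The data is processed in strides of 8, 4, 2 and 1 blocks plus a final
+C partial block; each stride decrypts with AES in CTR mode and folds the
+C ciphertext blocks into the GHASH digest before moving on.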
+
+C Register usage:
+define(`SP',   `r1')
+define(`TOCP', `r2')
+
+C Parameters:
+define(`TABLE',        `r3')
+define(`X',    `r4')   C In/out GCM/Ghash digest state
+define(`LENGTH',`r5')
+define(`SRC',  `r6')   C Ciphertext input
+define(`ROUNDS',`r7')
+define(`KEYS', `r8')
+define(`DST',  `r9')
+define(`PCTR', `r10')  C Pointer to 12B IV and starting 4B ctr
+
+C GCM/Ghash:
+define(`POLY_L',`v0')
+define(`D',    `v1')
+define(`H1M',  `v6')
+define(`H1L',  `v7')
+define(`H2M',  `v8')
+define(`H2L',  `v9')
+define(`H3M',  `v10')
+define(`H3L',  `v11')
+define(`H4M',  `v12')
+define(`H4L',  `v13')
+define(`R',    `v14')
+define(`F',    `v15')
+define(`R2',   `v16')
+define(`F2',   `v17')
+define(`T',    `v18')
+define(`R3',   `v20')
+define(`F3',   `v21')
+define(`R4',   `v22')
+define(`F4',   `v23')
+
+C AES:
+define(`K',    `v25')
+define(`S0',   `v2')
+define(`S1',   `v3')
+define(`S2',   `v4')
+define(`S3',   `v5')
+define(`S4',   `v26')
+define(`S5',   `v27')
+define(`S6',   `v28')
+define(`S7',   `v29')
+define(`CTR',  `v30')
+define(`INC',  `v31')
+define(`C0',   `v14')
+define(`C1',   `v15')
+define(`C2',   `v16')
+define(`C3',   `v17')
+define(`C4',   `v20')
+define(`C5',   `v21')
+define(`C6',   `v22')
+define(`C7',   `v23')
+
+define(`LCNT', `r14')
+define(`ZERO', `v16')
+define(`POLY', `v24')
+C misc: r15,r16,r17
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_aes_decrypt)
+
+       vxor            ZERO,ZERO,ZERO
+       subi            ROUNDS,ROUNDS,1         C Last AES round uses vcipherlast
+
+       C Store non-volatiles on the 288B stack redzone
+       std             r14,-8*1(SP)
+       std             r15,-8*2(SP)
+       std             r16,-8*3(SP)
+       std             r17,-8*4(SP)
+       stxv            VSR(v20),-16*3(SP)
+       stxv            VSR(v21),-16*4(SP)
+       stxv            VSR(v22),-16*5(SP)
+       stxv            VSR(v23),-16*6(SP)
+       stxv            VSR(v24),-16*7(SP)
+       stxv            VSR(v25),-16*8(SP)
+       stxv            VSR(v26),-16*9(SP)
+       stxv            VSR(v27),-16*10(SP)
+       stxv            VSR(v28),-16*11(SP)
+       stxv            VSR(v29),-16*12(SP)
+       stxv            VSR(v30),-16*13(SP)
+       stxv            VSR(v31),-16*14(SP)
+
+       DATA_LOAD_VEC(POLY,.polynomial,r14)
+       DATA_LOAD_VEC(INC,.increment,r14)
+
+       lxvb16x         VSR(CTR),0,PCTR         C Load initial counter block from 'ctr'
+       xxmrghd         VSR(POLY_L),VSR(ZERO),VSR(POLY)
+       lxvb16x         VSR(D),0,X              C Load GHASH state from 'x'
+
+L8x:
+       C --- process 8 blocks '128-bit each' per one loop ---
+       srdi.           LCNT,LENGTH,7           C 8-blocks loop count 'LENGTH / (8 * 16)'
+       beq             L4x
+
+       C load table elements
+       li              r15,4*16
+       li              r16,5*16
+       li              r17,6*16
+       lxvd2x          VSR(H3M),r15,TABLE
+       lxvd2x          VSR(H3L),r16,TABLE
+       lxvd2x          VSR(H4M),r17,TABLE
+       li              r16,7*16
+       lxvd2x          VSR(H4L),r16,TABLE
+       li              r15,1*16
+       li              r16,2*16
+       li              r17,3*16
+       lxvd2x          VSR(H1M),0,TABLE
+       lxvd2x          VSR(H1L),r15,TABLE
+       lxvd2x          VSR(H2M),r16,TABLE
+       lxvd2x          VSR(H2L),r17,TABLE
+
+L8x_loop:
+L8x_aes:
+       lxvb16x         VSR(K),0,KEYS
+
+       C Increment ctr
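+       C vadduwm adds per 32-bit lane with no carry between lanes, so only
+       C the 32-bit counter word is stepped mod 2^32 (GCM inc32) and the
+       C IV words are never touched.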
+       vmr             C0,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C0,C0,K
+       vmr             C1,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C1,C1,K
+       vmr             C2,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C2,C2,K
+       vmr             C3,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C3,C3,K
+
+       mtctr           ROUNDS
+       li              r15,1*16
+
+       vmr             C4,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C4,C4,K
+       vmr             C5,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C5,C5,K
+       vmr             C6,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C6,C6,K
+       vmr             C7,CTR
+       vadduwm         CTR,CTR,INC
+       vxor            C7,C7,K
+
+.align 5
+L8x_aes_rnd_loop:
+       lxvb16x         VSR(K),r15,KEYS
+       addi            r15,r15,1*16
+       vcipher         C0,C0,K
+       vcipher         C1,C1,K
+       vcipher         C2,C2,K
+       vcipher         C3,C3,K
+       vcipher         C4,C4,K
+       vcipher         C5,C5,K
+       vcipher         C6,C6,K
+       vcipher         C7,C7,K
+       bdnz            L8x_aes_rnd_loop
+
+       lxvb16x         VSR(K),r15,KEYS
+       vcipherlast     C0,C0,K
+       vcipherlast     C1,C1,K
+       vcipherlast     C2,C2,K
+       vcipherlast     C3,C3,K
+       vcipherlast     C4,C4,K
+       vcipherlast     C5,C5,K
+       vcipherlast     C6,C6,K
+       vcipherlast     C7,C7,K
+
+       C AES(counter) XOR ciphertext = plaintext
+       li              r15,1*16
+       li              r16,2*16
+       li              r17,3*16
+       lxvb16x         VSR(S0),0,SRC
+       lxvb16x         VSR(S1),r15,SRC
+       lxvb16x         VSR(S2),r16,SRC
+       lxvb16x         VSR(S3),r17,SRC
+       vxor            C0,C0,S0
+       vxor            C1,C1,S1
+       vxor            C2,C2,S2
+       vxor            C3,C3,S3
+
+       addi            SRC,SRC,4*16
+       lxvb16x         VSR(S4),0,SRC
+       lxvb16x         VSR(S5),r15,SRC
+       lxvb16x         VSR(S6),r16,SRC
+       lxvb16x         VSR(S7),r17,SRC
+       vxor            C4,C4,S4
+       vxor            C5,C5,S5
+       vxor            C6,C6,S6
+       vxor            C7,C7,S7
+
+       C Store plaintext in DST
+       stxvb16x        VSR(C0),0,DST
+       stxvb16x        VSR(C1),r15,DST
+       stxvb16x        VSR(C2),r16,DST
+       stxvb16x        VSR(C3),r17,DST
+       addi            DST,DST,4*16
+       stxvb16x        VSR(C4),0,DST
+       stxvb16x        VSR(C5),r15,DST
+       stxvb16x        VSR(C6),r16,DST
+       stxvb16x        VSR(C7),r17,DST
+
+       addi            SRC,SRC,4*16
+       addi            DST,DST,4*16
+
+L8x_gcm:
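+       C GHASH the eight ciphertext blocks in two aggregated passes:
+       C   D = (D ^ S0)*H^4 ^ S1*H^3 ^ S2*H^2 ^ S3*H  (then again for S4..S7)
+       C which is equivalent to eight chained single-block updates.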
+       C previous digest combining
+       vxor            S0,S0,D
+
+       C polynomial multiplication
+       vpmsumd         F2,H3L,S1
+       vpmsumd         R2,H3M,S1
+       vpmsumd         F3,H2L,S2
+       vpmsumd         R3,H2M,S2
+       vpmsumd         F4,H1L,S3
+       vpmsumd         R4,H1M,S3
+       vpmsumd         F,H4L,S0
+       vpmsumd         R,H4M,S0
+
+       C deferred recombination of partial products
+       vxor            F3,F3,F4
+       vxor            R3,R3,R4
+       vxor            F,F,F2
+       vxor            R,R,R2
+       vxor            F,F,F3
+       vxor            R,R,R3
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+       C previous digest combining
+       vxor            S4,S4,D
+
+       C polynomial multiplication
+       vpmsumd         F2,H3L,S5
+       vpmsumd         R2,H3M,S5
+       vpmsumd         F3,H2L,S6
+       vpmsumd         R3,H2M,S6
+       vpmsumd         F4,H1L,S7
+       vpmsumd         R4,H1M,S7
+       vpmsumd         F,H4L,S4
+       vpmsumd         R,H4M,S4
+
+       C deferred recombination of partial products
+       vxor            F3,F3,F4
+       vxor            R3,R3,R4
+       vxor            F,F,F2
+       vxor            R,R,R2
+       vxor            F,F,F3
+       vxor            R,R,R3
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+       C Decrement 8x block count and check if done
+       subi            LCNT,LCNT,1
+       cmpldi          LCNT,0
+       bne             L8x_loop
+       clrldi          LENGTH,LENGTH,57        C Keep LENGTH mod (8 * 16): clear the high-order 57 bits
+
+L4x:
+       C --- process 4 blocks ---
+       srdi.           LCNT,LENGTH,6           C 4-blocks loop count 'LENGTH / (4 * 16)'
+       beq             L2x
+
+       C load table elements
+       li              r15,4*16
+       li              r16,5*16
+       li              r17,6*16
+       lxvd2x          VSR(H3M),r15,TABLE
+       lxvd2x          VSR(H3L),r16,TABLE
+       lxvd2x          VSR(H4M),r17,TABLE
+       li              r16,7*16
+       lxvd2x          VSR(H4L),r16,TABLE
+       li              r15,1*16
+       li              r16,2*16
+       li              r17,3*16
+       lxvd2x          VSR(H1M),0,TABLE
+       lxvd2x          VSR(H1L),r15,TABLE
+       lxvd2x          VSR(H2M),r16,TABLE
+       lxvd2x          VSR(H2L),r17,TABLE
+
+L4x_aes:
+       lxvb16x         VSR(K),0,KEYS
+
+       C Increment ctr
+       vmr             C0,CTR
+       vadduwm         CTR,CTR,INC
+       vmr             C1,CTR
+       vadduwm         CTR,CTR,INC
+       vmr             C2,CTR
+       vadduwm         CTR,CTR,INC
+       vmr             C3,CTR
+       vadduwm         CTR,CTR,INC
+
+       vxor            C0,C0,K
+       vxor            C1,C1,K
+       vxor            C2,C2,K
+       vxor            C3,C3,K
+
+       mtctr           ROUNDS
+       li              r15,1*16
+
+.align 5
+L4x_aes_rnd_loop:
+       lxvb16x         VSR(K),r15,KEYS
+       vcipher         C0,C0,K
+       vcipher         C1,C1,K
+       vcipher         C2,C2,K
+       vcipher         C3,C3,K
+       addi            r15,r15,1*16
+       bdnz            L4x_aes_rnd_loop
+
+       lxvb16x         VSR(K),r15,KEYS
+       vcipherlast     C0,C0,K
+       vcipherlast     C1,C1,K
+       vcipherlast     C2,C2,K
+       vcipherlast     C3,C3,K
+
+       C AES(counter) XOR ciphertext = plaintext
+       li              r15,1*16
+       li              r16,2*16
+       li              r17,3*16
+       lxvb16x         VSR(S0),0,SRC
+       lxvb16x         VSR(S1),r15,SRC
+       lxvb16x         VSR(S2),r16,SRC
+       lxvb16x         VSR(S3),r17,SRC
+       vxor            C0,C0,S0
+       vxor            C1,C1,S1
+       vxor            C2,C2,S2
+       vxor            C3,C3,S3
+
+       C Store plaintext in DST
+       stxvb16x        VSR(C0),0,DST
+       stxvb16x        VSR(C1),r15,DST
+       stxvb16x        VSR(C2),r16,DST
+       stxvb16x        VSR(C3),r17,DST
+
+L4x_gcm:
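+       C Same aggregated GHASH update as the 8x passes, applied to S0..S3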
+       C previous digest combining
+       vxor            S0,S0,D
+
+       C polynomial multiplication
+       vpmsumd         F2,H3L,S1
+       vpmsumd         R2,H3M,S1
+       vpmsumd         F3,H2L,S2
+       vpmsumd         R3,H2M,S2
+       vpmsumd         F4,H1L,S3
+       vpmsumd         R4,H1M,S3
+       vpmsumd         F,H4L,S0
+       vpmsumd         R,H4M,S0
+
+       C deferred recombination of partial products
+       vxor            F3,F3,F4
+       vxor            R3,R3,R4
+       vxor            F,F,F2
+       vxor            R,R,R2
+       vxor            F,F,F3
+       vxor            R,R,R3
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+       addi            DST,DST,4*16
+       addi            SRC,SRC,4*16
+       clrldi          LENGTH,LENGTH,58        C Keep LENGTH mod (4 * 16): clear the high-order 58 bits
+
+L2x:
+       C --- process 2 blocks ---
+       srdi.           r14,LENGTH,5            C 'LENGTH / (2 * 16)'
+       beq             L1x
+
+       C load table elements
+       li              r15,1*16
+       li              r16,2*16
+       li              r17,3*16
+       lxvd2x          VSR(H1M),0,TABLE
+       lxvd2x          VSR(H1L),r15,TABLE
+       lxvd2x          VSR(H2M),r16,TABLE
+       lxvd2x          VSR(H2L),r17,TABLE
+
+L2x_aes:
+       lxvb16x         VSR(K),0,KEYS
+
+       C Increment ctr
+       vmr             C0,CTR
+       vadduwm         CTR,CTR,INC
+       vmr             C1,CTR
+       vadduwm         CTR,CTR,INC
+
+       vxor            C0,C0,K
+       vxor            C1,C1,K
+
+       mtctr           ROUNDS
+       li              r15,1*16
+
+.align 5
+L2x_aes_rnd_loop:
+       lxvb16x         VSR(K),r15,KEYS
+       vcipher         C0,C0,K
+       vcipher         C1,C1,K
+       addi            r15,r15,1*16
+       bdnz            L2x_aes_rnd_loop
+
+       lxvb16x         VSR(K),r15,KEYS
+       vcipherlast     C0,C0,K
+       vcipherlast     C1,C1,K
+
+       C AES(counter) XOR ciphertext = plaintext
+       li              r15,1*16
+       lxvb16x         VSR(S0),0,SRC
+       lxvb16x         VSR(S1),r15,SRC
+       vxor            C0,C0,S0
+       vxor            C1,C1,S1
+
+       C Store plaintext in DST
+       stxvb16x        VSR(C0),0,DST
+       stxvb16x        VSR(C1),r15,DST
+
+L2x_gcm:
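+       C Aggregated GHASH update over two blocks: D = (D ^ S0)*H^2 ^ S1*H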
+       C previous digest combining
+       vxor            S0,S0,D
+
+       C polynomial multiplication
+       vpmsumd         F2,H1L,S1
+       vpmsumd         R2,H1M,S1
+       vpmsumd         F,H2L,S0
+       vpmsumd         R,H2M,S0
+
+       C deferred recombination of partial products
+       vxor            F,F,F2
+       vxor            R,R,R2
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+       addi            DST,DST,2*16
+       addi            SRC,SRC,2*16
+       clrldi          LENGTH,LENGTH,59        C Keep LENGTH mod (2 * 16): clear the high-order 59 bits
+
+L1x:
+       C --- process 1 block ---
+       srdi.           r14,LENGTH,4            C 'LENGTH / (1 * 16)'
+       beq             Lpartial
+
+       C load table elements
+       li              r15,1*16
+       lxvd2x          VSR(H1M),0,TABLE
+       lxvd2x          VSR(H1L),r15,TABLE
+
+L1x_aes:
+       lxvb16x         VSR(K),0,KEYS
+
+       C Increment ctr
+       vmr             C0,CTR
+       vadduwm         CTR,CTR,INC
+
+       vxor            C0,C0,K
+
+       mtctr           ROUNDS
+       li              r15,1*16
+
+.align 5
+L1x_aes_rnd_loop:
+       lxvb16x         VSR(K),r15,KEYS
+       vcipher         C0,C0,K
+       addi            r15,r15,1*16
+       bdnz            L1x_aes_rnd_loop
+
+       lxvb16x         VSR(K),r15,KEYS
+       vcipherlast     C0,C0,K
+
+       C AES(counter) XOR ciphertext = plaintext
+       lxvb16x         VSR(S0),0,SRC
+       vxor            C0,C0,S0
+
+       C Store plaintext in DST
+       stxvb16x        VSR(C0),0,DST
+
+L1x_gcm:
+       C previous digest combining
+       vxor            S0,S0,D
+
+       C polynomial multiplication
+       vpmsumd         F,H1L,S0
+       vpmsumd         R,H1M,S0
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+       addi            DST,DST,1*16
+       addi            SRC,SRC,1*16
+       clrldi          LENGTH,LENGTH,60        C Keep LENGTH mod 16: clear the high-order 60 bits
+
+Lpartial:
+       C --- process partial block ---
+       cmpldi          LENGTH,0
+       beq             Ldone
+
+       C load table elements
+       li              r15,1*16
+       lxvd2x          VSR(H1M),0,TABLE
+       lxvd2x          VSR(H1L),r15,TABLE
+
+Lpartial_aes:
+       lxvb16x         VSR(K),0,KEYS
+
+       C Increment ctr
+       vmr             C0,CTR
+       vadduwm         CTR,CTR,INC
+
+       vxor            C0,C0,K
+
+       mtctr           ROUNDS
+       li              r15,1*16
+
+.align 5
+Lpartial_aes_rnd_loop:
+       lxvb16x         VSR(K),r15,KEYS
+       vcipher         C0,C0,K
+       addi            r15,r15,1*16
+       bdnz            Lpartial_aes_rnd_loop
+
+       lxvb16x         VSR(K),r15,KEYS
+       vcipherlast     C0,C0,K
+
+       C Load the partial block left-aligned and zero-padded
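+       C lxvll/stxvll take the byte count from bits 0:7 (the top byte) of
+       C the length GPR, hence the shift left by 56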
+       sldi            LENGTH,LENGTH,56
+       lxvll           VSR(S0),SRC,LENGTH
+
+       C AES(counter) XOR ciphertext = plaintext
+       vxor            C0,C0,S0
+
+       C Store plaintext in DST
+       stxvll          VSR(C0),DST,LENGTH
+
+Lpartial_gcm:
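+       C GHASH uses the zero-padded ciphertext block loaded above, as
+       C required for a trailing partial block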
+       C previous digest combining
+       vxor            S0,S0,D
+
+       C polynomial multiplication
+       vpmsumd         F,H1L,S0
+       vpmsumd         R,H1M,S0
+
+       C reduction
+       vpmsumd         T,F,POLY_L
+       xxswapd         VSR(D),VSR(F)
+       vxor            R,R,T
+       vxor            D,R,D
+
+Ldone:
+       stxvb16x        VSR(D),0,X              C store digest 'D'
+       stxvb16x        VSR(CTR),0,PCTR         C store updated 'ctr'
+
+       C Restore non-volatiles from the 288B stack redzone
+       ld              r14,-8*1(SP)
+       ld              r15,-8*2(SP)
+       ld              r16,-8*3(SP)
+       ld              r17,-8*4(SP)
+       lxv             VSR(v20),-16*3(SP)
+       lxv             VSR(v21),-16*4(SP)
+       lxv             VSR(v22),-16*5(SP)
+       lxv             VSR(v23),-16*6(SP)
+       lxv             VSR(v24),-16*7(SP)
+       lxv             VSR(v25),-16*8(SP)
+       lxv             VSR(v26),-16*9(SP)
+       lxv             VSR(v27),-16*10(SP)
+       lxv             VSR(v28),-16*11(SP)
+       lxv             VSR(v29),-16*12(SP)
+       lxv             VSR(v30),-16*13(SP)
+       lxv             VSR(v31),-16*14(SP)
+
+       li              r3,0                    C return 0 for success
+       blr
+
+EPILOGUE(_nettle_gcm_aes_decrypt)
+
+.data
+.align 4
+C 0xC2000000000000000000000000000001
+.polynomial:
+IF_BE(`
+       .byte 0xC2
+       .rept 14
+       .byte 0x00
+       .endr
+       .byte 0x01
+',`
+       .byte 0x01
+       .rept 14
+       .byte 0x00
+       .endr
+       .byte 0xC2
+')
+.align 4
+.increment:
+IF_LE(`
+       .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+')
+IF_BE(`
+       .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+')
-- 
2.26.1
