I made this patch handle AES with the fixed key sizes of 128, 192, and
256 bits; restricting it to those sizes let me eliminate reloading the
expanded round keys on every round. Since this technique produces a
clear performance benefit, I'm planning to keep the implementation as
is, and if handling uncommon key sizes ever becomes mandatory, I can
add an extra branch to process message blocks with any key size. What
do you think?
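
For reference, the three fixed paths correspond to the standard AES
round counts; a minimal C sketch of the mapping (illustrative only,
not part of the patch, with a hypothetical helper name):

    /* Standard AES round counts per FIPS-197; these are the three
       fixed code paths the patch branches between. */
    #include <stdio.h>

    static unsigned
    aes_rounds (unsigned key_bits)
    {
      switch (key_bits)
        {
        case 128: return 10;
        case 192: return 12;
        case 256: return 14;
        default:  return 0;   /* a size the extra branch would handle */
        }
    }

    int
    main (void)
    {
      printf ("AES-128: %u rounds\n", aes_rounds (128));
      printf ("AES-192: %u rounds\n", aes_rounds (192));
      printf ("AES-256: %u rounds\n", aes_rounds (256));
      return 0;
    }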

regards,
Mamone

On Sat, May 1, 2021 at 5:39 PM Maamoun TK <maamoun...@googlemail.com> wrote:

> This patch optimizes the nettle_aes_encrypt() and nettle_aes_decrypt()
> functions for the arm64 architecture. It takes advantage of the 'aese'
> and 'aesmc' instructions to optimize the encryption function, and of
> 'aesd' and 'aesimc' to optimize the decryption function.
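>
> For anyone less familiar with these instructions, here is roughly what
> one middle round and the last round compute, written with the ACLE
> <arm_neon.h> intrinsics (a C sketch with hypothetical helper names,
> not code from the patch; compile with -march=armv8-a+crypto):
>
>     #include <arm_neon.h>
>
>     /* One middle round: AESE does AddRoundKey + SubBytes +
>        ShiftRows, AESMC does MixColumns, matching the aese/aesmc
>        pairs in AES_ROUND_4B below. */
>     uint8x16_t
>     enc_round (uint8x16_t state, uint8x16_t round_key)
>     {
>       return vaesmcq_u8 (vaeseq_u8 (state, round_key));
>     }
>
>     /* The last round omits MixColumns and finishes with a plain
>        XOR of the final round key, as in AES_LAST_ROUND_4B below. */
>     uint8x16_t
>     enc_last_round (uint8x16_t state, uint8x16_t round_key,
>                     uint8x16_t last_key)
>     {
>       return veorq_u8 (vaeseq_u8 (state, round_key), last_key);
>     }
>
> The decrypt side follows the same pattern with vaesdq_u8/vaesimcq_u8.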
>
> The patch passes the nettle testsuite. I also ran the benchmark on the
> gcc117 instance of CFarm, configuring the library with "--disable-fat
> --enable-arm64-crypto" and then executing examples/nettle-benchmark:
>
> aes128  ECB encrypt 2522.67
> aes128  ECB decrypt 2522.53
> aes192  ECB encrypt 2165.06
> aes192  ECB decrypt 2165.04
> aes256  ECB encrypt 1866.80
> aes256  ECB decrypt 1866.38
>
> openssl aes128  ECB encrypt 1043.52
> openssl aes128  ECB decrypt 1043.05
> openssl aes192  ECB encrypt  904.08
> openssl aes192  ECB decrypt  903.85
> openssl aes256  ECB encrypt  787.43
> openssl aes256  ECB decrypt  787.20
>
> gcm_aes128      encrypt  955.10
> gcm_aes128      decrypt  955.06
> gcm_aes128       update 3269.18
> gcm_aes192      encrypt  896.26
> gcm_aes192      decrypt  896.46
> gcm_aes192       update 3270.24
> gcm_aes256      encrypt  840.17
> gcm_aes256      decrypt  843.53
> gcm_aes256       update 3270.08
>
> openssl gcm_aes128      encrypt  894.51
> openssl gcm_aes128      decrypt  899.05
> openssl gcm_aes128       update 1636.61
> openssl gcm_aes192      encrypt  834.94
> openssl gcm_aes192      decrypt  841.99
> openssl gcm_aes192       update 1631.40
> openssl gcm_aes256      encrypt  788.48
> openssl gcm_aes256      decrypt  791.31
> openssl gcm_aes256       update 1635.18
>
> I'm a little suspicious of the benchmark numbers because, as I
> remember, the performance of gcm update was not double the OpenSSL
> number before. I re-ran the benchmark, but it kept giving the same
> performance margin.
>
> ---
>  arm64/crypto/aes-decrypt-internal.asm | 223 ++++++++++++++++++++++++++++++++++
>  arm64/crypto/aes-encrypt-internal.asm | 223 ++++++++++++++++++++++++++++++++++
>  2 files changed, 446 insertions(+)
>  create mode 100644 arm64/crypto/aes-decrypt-internal.asm
>  create mode 100644 arm64/crypto/aes-encrypt-internal.asm
>
> diff --git a/arm64/crypto/aes-decrypt-internal.asm b/arm64/crypto/aes-decrypt-internal.asm
> new file mode 100644
> index 00000000..4bfdb314
> --- /dev/null
> +++ b/arm64/crypto/aes-decrypt-internal.asm
> @@ -0,0 +1,223 @@
> +C arm64/crypto/aes-decrypt-internal.asm
> +
> +ifelse(`
> +   Copyright (C) 2021 Mamone Tarsha
> +   This file is part of GNU Nettle.
> +
> +   GNU Nettle is free software: you can redistribute it and/or
> +   modify it under the terms of either:
> +
> +     * the GNU Lesser General Public License as published by the Free
> +       Software Foundation; either version 3 of the License, or (at your
> +       option) any later version.
> +
> +   or
> +
> +     * the GNU General Public License as published by the Free
> +       Software Foundation; either version 2 of the License, or (at your
> +       option) any later version.
> +
> +   or both in parallel, as here.
> +
> +   GNU Nettle is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received copies of the GNU General Public License and
> +   the GNU Lesser General Public License along with this program.  If
> +   not, see http://www.gnu.org/licenses/.
> +')
> +
> +.file "aes-decrypt-internal.asm"
> +.arch armv8-a+crypto
> +
> +.text
> +
> +C Register usage:
> +
> +define(`ROUNDS', `x0')
> +define(`KEYS', `x1')
> +define(`LENGTH', `x3')
> +define(`DST', `x4')
> +define(`SRC', `x5')
> +
> +define(`S0', `v0')
> +define(`S1', `v1')
> +define(`S2', `v2')
> +define(`S3', `v3')
> +define(`K0', `v16')
> +define(`K1', `v17')
> +define(`K2', `v18')
> +define(`K3', `v19')
> +define(`K4', `v20')
> +define(`K5', `v21')
> +define(`K6', `v22')
> +define(`K7', `v23')
> +define(`K8', `v24')
> +define(`K9', `v25')
> +define(`K10', `v26')
> +define(`K11', `v27')
> +define(`K12', `v28')
> +define(`K13', `v29')
> +define(`K14', `v30')
> +
> +C AES_ROUND_4B(KEY)
> +define(`AES_ROUND_4B', m4_assert_numargs(1)`
> +    aesd           S0.16b,$1.16b
> +    aesimc         S0.16b,S0.16b
> +    aesd           S1.16b,$1.16b
> +    aesimc         S1.16b,S1.16b
> +    aesd           S2.16b,$1.16b
> +    aesimc         S2.16b,S2.16b
> +    aesd           S3.16b,$1.16b
> +    aesimc         S3.16b,S3.16b
> +')
> +
> +C AES_LAST_ROUND_4B(KEY)
> +define(`AES_LAST_ROUND_4B', m4_assert_numargs(1)`
> +    aesd           S0.16b,$1.16b
> +    eor            S0.16b,S0.16b,K14.16b
> +    aesd           S1.16b,$1.16b
> +    eor            S1.16b,S1.16b,K14.16b
> +    aesd           S2.16b,$1.16b
> +    eor            S2.16b,S2.16b,K14.16b
> +    aesd           S3.16b,$1.16b
> +    eor            S3.16b,S3.16b,K14.16b
> +')
> +
> +C AES_ROUND_1B(KEY)
> +define(`AES_ROUND_1B', m4_assert_numargs(1)`
> +    aesd           S0.16b,$1.16b
> +    aesimc         S0.16b,S0.16b
> +')
> +
> +C AES_LAST_ROUND_1B(KEY)
> +define(`AES_LAST_ROUND_1B', m4_assert_numargs(1)`
> +    aesd           S0.16b,$1.16b
> +    eor            S0.16b,S0.16b,K14.16b
> +')
> +
> +C _aes_decrypt(unsigned rounds, const uint32_t *keys,
> +C       const struct aes_table *T,
> +C       size_t length, uint8_t *dst,
> +C       const uint8_t *src)
> +
> +PROLOGUE(_nettle_aes_decrypt)
> +    ands           x6,LENGTH,#-64
> +    b.eq           L1B
> +
> +    mov            x7,KEYS
> +    ld1            {K0.4s,K1.4s,K2.4s,K3.4s},[x7],#64
> +    ld1            {K4.4s,K5.4s,K6.4s,K7.4s},[x7],#64
> +    ld1            {K8.4s,K9.4s},[x7],#32
> +    cmp            ROUNDS,#10
> +    b.eq           L4B_last_key
> +    ld1            {K10.4s,K11.4s},[x7],#32
> +    cmp            ROUNDS,#12
> +    b.eq           L4B_last_key
> +    ld1            {K12.4s,K13.4s},[x7],#32
> +
> +L4B_last_key:
> +    ld1            {K14.4s},[x7]
> +
> +L4B_loop:
> +    ld1            {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
> +
> +    AES_ROUND_4B(K0)
> +    AES_ROUND_4B(K1)
> +    AES_ROUND_4B(K2)
> +    AES_ROUND_4B(K3)
> +    AES_ROUND_4B(K4)
> +    AES_ROUND_4B(K5)
> +    AES_ROUND_4B(K6)
> +    AES_ROUND_4B(K7)
> +    AES_ROUND_4B(K8)
> +    cmp            ROUNDS,#10
> +    b.eq           L4B_10_round
> +    cmp            ROUNDS,#12
> +    b.eq           L4B_12_round
> +    b              L4B_14_round
> +
> +L4B_10_round:
> +    AES_LAST_ROUND_4B(K9)
> +    b              L4B_done
> +L4B_12_round:
> +    AES_ROUND_4B(K9)
> +    AES_ROUND_4B(K10)
> +    AES_LAST_ROUND_4B(K11)
> +    b              L4B_done
> +L4B_14_round:
> +    AES_ROUND_4B(K9)
> +    AES_ROUND_4B(K10)
> +    AES_ROUND_4B(K11)
> +    AES_ROUND_4B(K12)
> +    AES_LAST_ROUND_4B(K13)
> +
> +L4B_done:
> +    st1            {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
> +
> +    subs           x6,x6,#64
> +    b.ne           L4B_loop
> +
> +    and            LENGTH,LENGTH,#63
> +
> +L1B:
> +    cbz            LENGTH,Ldone
> +
> +    mov            x6,KEYS
> +    ld1            {K0.4s,K1.4s,K2.4s,K3.4s},[x6],#64
> +    ld1            {K4.4s,K5.4s,K6.4s,K7.4s},[x6],#64
> +    ld1            {K8.4s,K9.4s},[x6],#32
> +    cmp            ROUNDS,#10
> +    b.eq           L1B_last_key
> +    ld1            {K10.4s,K11.4s},[x6],#32
> +    cmp            ROUNDS,#12
> +    b.eq           L1B_last_key
> +    ld1            {K12.4s,K13.4s},[x6],#32
> +
> +L1B_last_key:
> +    ld1            {K14.4s},[x6]
> +
> +L1B_loop:
> +    ld1            {S0.16b},[SRC],#16
> +
> +    AES_ROUND_1B(K0)
> +    AES_ROUND_1B(K1)
> +    AES_ROUND_1B(K2)
> +    AES_ROUND_1B(K3)
> +    AES_ROUND_1B(K4)
> +    AES_ROUND_1B(K5)
> +    AES_ROUND_1B(K6)
> +    AES_ROUND_1B(K7)
> +    AES_ROUND_1B(K8)
> +    cmp            ROUNDS,#10
> +    b.eq           L1B_10_round
> +    cmp            ROUNDS,#12
> +    b.eq           L1B_12_round
> +    b              L1B_14_round
> +
> +L1B_10_round:
> +    AES_LAST_ROUND_1B(K9)
> +    b              L1B_done
> +L1B_12_round:
> +    AES_ROUND_1B(K9)
> +    AES_ROUND_1B(K10)
> +    AES_LAST_ROUND_1B(K11)
> +    b              L1B_done
> +L1B_14_round:
> +    AES_ROUND_1B(K9)
> +    AES_ROUND_1B(K10)
> +    AES_ROUND_1B(K11)
> +    AES_ROUND_1B(K12)
> +    AES_LAST_ROUND_1B(K13)
> +
> +L1B_done:
> +    st1            {S0.16b},[DST],#16
> +
> +    subs           LENGTH,LENGTH,#16
> +    b.ne           L1B_loop
> +
> +Ldone:
> +    ret
> +EPILOGUE(_nettle_aes_decrypt)
> diff --git a/arm64/crypto/aes-encrypt-internal.asm b/arm64/crypto/aes-encrypt-internal.asm
> new file mode 100644
> index 00000000..314f9333
> --- /dev/null
> +++ b/arm64/crypto/aes-encrypt-internal.asm
> @@ -0,0 +1,223 @@
> +C arm64/crypto/aes-encrypt-internal.asm
> +
> +ifelse(`
> +   Copyright (C) 2021 Mamone Tarsha
> +   This file is part of GNU Nettle.
> +
> +   GNU Nettle is free software: you can redistribute it and/or
> +   modify it under the terms of either:
> +
> +     * the GNU Lesser General Public License as published by the Free
> +       Software Foundation; either version 3 of the License, or (at your
> +       option) any later version.
> +
> +   or
> +
> +     * the GNU General Public License as published by the Free
> +       Software Foundation; either version 2 of the License, or (at your
> +       option) any later version.
> +
> +   or both in parallel, as here.
> +
> +   GNU Nettle is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received copies of the GNU General Public License and
> +   the GNU Lesser General Public License along with this program.  If
> +   not, see http://www.gnu.org/licenses/.
> +')
> +
> +.file "aes-encrypt-internal.asm"
> +.arch armv8-a+crypto
> +
> +.text
> +
> +C Register usage:
> +
> +define(`ROUNDS', `x0')
> +define(`KEYS', `x1')
> +define(`LENGTH', `x3')
> +define(`DST', `x4')
> +define(`SRC', `x5')
> +
> +define(`S0', `v0')
> +define(`S1', `v1')
> +define(`S2', `v2')
> +define(`S3', `v3')
> +define(`K0', `v16')
> +define(`K1', `v17')
> +define(`K2', `v18')
> +define(`K3', `v19')
> +define(`K4', `v20')
> +define(`K5', `v21')
> +define(`K6', `v22')
> +define(`K7', `v23')
> +define(`K8', `v24')
> +define(`K9', `v25')
> +define(`K10', `v26')
> +define(`K11', `v27')
> +define(`K12', `v28')
> +define(`K13', `v29')
> +define(`K14', `v30')
> +
> +C AES_ROUND_4B(KEY)
> +define(`AES_ROUND_4B', m4_assert_numargs(1)`
> +    aese           S0.16b,$1.16b
> +    aesmc          S0.16b,S0.16b
> +    aese           S1.16b,$1.16b
> +    aesmc          S1.16b,S1.16b
> +    aese           S2.16b,$1.16b
> +    aesmc          S2.16b,S2.16b
> +    aese           S3.16b,$1.16b
> +    aesmc          S3.16b,S3.16b
> +')
> +
> +C AES_LAST_ROUND_4B(KEY)
> +define(`AES_LAST_ROUND_4B', m4_assert_numargs(1)`
> +    aese           S0.16b,$1.16b
> +    eor            S0.16b,S0.16b,K14.16b
> +    aese           S1.16b,$1.16b
> +    eor            S1.16b,S1.16b,K14.16b
> +    aese           S2.16b,$1.16b
> +    eor            S2.16b,S2.16b,K14.16b
> +    aese           S3.16b,$1.16b
> +    eor            S3.16b,S3.16b,K14.16b
> +')
> +
> +C AES_ROUND_1B(KEY)
> +define(`AES_ROUND_1B', m4_assert_numargs(1)`
> +    aese           S0.16b,$1.16b
> +    aesmc          S0.16b,S0.16b
> +')
> +
> +C AES_LAST_ROUND_1B(KEY)
> +define(`AES_LAST_ROUND_1B', m4_assert_numargs(1)`
> +    aese           S0.16b,$1.16b
> +    eor            S0.16b,S0.16b,K14.16b
> +')
> +
> +C _aes_encrypt(unsigned rounds, const uint32_t *keys,
> +C       const struct aes_table *T,
> +C       size_t length, uint8_t *dst,
> +C       const uint8_t *src)
> +
> +PROLOGUE(_nettle_aes_encrypt)
> +    ands           x6,LENGTH,#-64
> +    b.eq           L1B
> +
> +    mov            x7,KEYS
> +    ld1            {K0.4s,K1.4s,K2.4s,K3.4s},[x7],#64
> +    ld1            {K4.4s,K5.4s,K6.4s,K7.4s},[x7],#64
> +    ld1            {K8.4s,K9.4s},[x7],#32
> +    cmp            ROUNDS,#10
> +    b.eq           L4B_last_key
> +    ld1            {K10.4s,K11.4s},[x7],#32
> +    cmp            ROUNDS,#12
> +    b.eq           L4B_last_key
> +    ld1            {K12.4s,K13.4s},[x7],#32
> +
> +L4B_last_key:
> +    ld1            {K14.4s},[x7]
> +
> +L4B_loop:
> +    ld1            {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
> +
> +    AES_ROUND_4B(K0)
> +    AES_ROUND_4B(K1)
> +    AES_ROUND_4B(K2)
> +    AES_ROUND_4B(K3)
> +    AES_ROUND_4B(K4)
> +    AES_ROUND_4B(K5)
> +    AES_ROUND_4B(K6)
> +    AES_ROUND_4B(K7)
> +    AES_ROUND_4B(K8)
> +    cmp            ROUNDS,#10
> +    b.eq           L4B_10_round
> +    cmp            ROUNDS,#12
> +    b.eq           L4B_12_round
> +    b              L4B_14_round
> +
> +L4B_10_round:
> +    AES_LAST_ROUND_4B(K9)
> +    b              L4B_done
> +L4B_12_round:
> +    AES_ROUND_4B(K9)
> +    AES_ROUND_4B(K10)
> +    AES_LAST_ROUND_4B(K11)
> +    b              L4B_done
> +L4B_14_round:
> +    AES_ROUND_4B(K9)
> +    AES_ROUND_4B(K10)
> +    AES_ROUND_4B(K11)
> +    AES_ROUND_4B(K12)
> +    AES_LAST_ROUND_4B(K13)
> +
> +L4B_done:
> +    st1            {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
> +
> +    subs           x6,x6,#64
> +    b.ne           L4B_loop
> +
> +    and            LENGTH,LENGTH,#63
> +
> +L1B:
> +    cbz            LENGTH,Ldone
> +
> +    mov            x6,KEYS
> +    ld1            {K0.4s,K1.4s,K2.4s,K3.4s},[x6],#64
> +    ld1            {K4.4s,K5.4s,K6.4s,K7.4s},[x6],#64
> +    ld1            {K8.4s,K9.4s},[x6],#32
> +    cmp            ROUNDS,#10
> +    b.eq           L1B_last_key
> +    ld1            {K10.4s,K11.4s},[x6],#32
> +    cmp            ROUNDS,#12
> +    b.eq           L1B_last_key
> +    ld1            {K12.4s,K13.4s},[x6],#32
> +
> +L1B_last_key:
> +    ld1            {K14.4s},[x6]
> +
> +L1B_loop:
> +    ld1            {S0.16b},[SRC],#16
> +
> +    AES_ROUND_1B(K0)
> +    AES_ROUND_1B(K1)
> +    AES_ROUND_1B(K2)
> +    AES_ROUND_1B(K3)
> +    AES_ROUND_1B(K4)
> +    AES_ROUND_1B(K5)
> +    AES_ROUND_1B(K6)
> +    AES_ROUND_1B(K7)
> +    AES_ROUND_1B(K8)
> +    cmp            ROUNDS,#10
> +    b.eq           L1B_10_round
> +    cmp            ROUNDS,#12
> +    b.eq           L1B_12_round
> +    b              L1B_14_round
> +
> +L1B_10_round:
> +    AES_LAST_ROUND_1B(K9)
> +    b              L1B_done
> +L1B_12_round:
> +    AES_ROUND_1B(K9)
> +    AES_ROUND_1B(K10)
> +    AES_LAST_ROUND_1B(K11)
> +    b              L1B_done
> +L1B_14_round:
> +    AES_ROUND_1B(K9)
> +    AES_ROUND_1B(K10)
> +    AES_ROUND_1B(K11)
> +    AES_ROUND_1B(K12)
> +    AES_LAST_ROUND_1B(K13)
> +
> +L1B_done:
> +    st1            {S0.16b},[DST],#16
> +
> +    subs           LENGTH,LENGTH,#16
> +    b.ne           L1B_loop
> +
> +Ldone:
> +    ret
> +EPILOGUE(_nettle_aes_encrypt)
>
> --
> 2.25.1
>