Module Name: src
Committed By: christos
Date: Wed May 31 19:35:31 UTC 2023
Modified Files:
src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64:
aes-gcm-armv8_64.S aesv8-armx.S
src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm:
aes-gcm-armv8_64.S bsaes-armv7.S
src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc:
ecp_nistp521-ppc64.S
src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64:
ecp_nistp521-ppc64.S
Log Message:
regen
To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
cvs rdiff -u -r1.5 -r1.6 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S
cvs rdiff -u -r1.1 -r1.2 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S
cvs rdiff -u -r1.6 -r1.7 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S
cvs rdiff -u -r1.1 -r1.2 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S
cvs rdiff -u -r1.1 -r1.2 \
src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.3
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.2 Wed May 10 21:31:54 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S Wed May 31 15:35:31 2023
@@ -19,28 +19,36 @@ aes_gcm_enc_128_kernel:
stp d14, d15, [sp, #96]
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
ldp x13, x14, [x8, #160] //load rk10
-
+#ifdef __AARCH64EB__
+ ror x13, x13, #32
+ ror x14, x14, #32
+#endif
ld1 {v11.16b}, [x3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
lsr x5, x1, #3 //byte_len
mov x15, x5
- ldr q27, [x8, #144] //load rk9
+ ld1 {v18.4s}, [x8], #16 //load rk0
add x4, x0, x1, lsr #3 //end_input_ptr
sub x5, x5, #1 //byte_len - 1
lsr x12, x11, #32
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
fmov d1, x10 //CTR block 1
rev w12, w12 //rev_ctr32
add w12, w12, #1 //increment rev_ctr32
orr w11, w11, w11
- ldr q18, [x8, #0] //load rk0
+ ld1 {v19.4s}, [x8], #16 //load rk1
rev w9, w12 //CTR block 1
add w12, w12, #1 //CTR block 1
@@ -60,30 +68,33 @@ aes_gcm_enc_128_kernel:
rev w9, w12 //CTR block 3
orr x9, x11, x9, lsl #32 //CTR block 3
- ldr q19, [x8, #16] //load rk1
+ ld1 {v20.4s}, [x8], #16 //load rk2
add w12, w12, #1 //CTR block 3
fmov v3.d[1], x9 //CTR block 3
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
- ldr q20, [x8, #32] //load rk2
+ ld1 {v21.4s}, [x8], #16 //load rk3
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
+#endif
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
- ldr q26, [x8, #128] //load rk8
+ ld1 {v22.4s}, [x8], #16 //load rk4
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
- ldr q21, [x8, #48] //load rk3
+ ld1 {v23.4s}, [x8], #16 //load rk5
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
@@ -91,11 +102,11 @@ aes_gcm_enc_128_kernel:
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
- ldr q24, [x8, #96] //load rk6
+ ld1 {v24.4s}, [x8], #16 //load rk6
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
- ldr q25, [x8, #112] //load rk7
+ ld1 {v25.4s}, [x8], #16 //load rk7
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
@@ -103,12 +114,14 @@ aes_gcm_enc_128_kernel:
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
- ldr q23, [x8, #80] //load rk5
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
+#endif
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
@@ -125,7 +138,7 @@ aes_gcm_enc_128_kernel:
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
- ldr q22, [x8, #64] //load rk4
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
@@ -208,13 +221,25 @@ aes_gcm_enc_128_kernel:
b.ge .L128_enc_tail //handle tail
ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
eor x6, x6, x13 //AES block 0 - round 10 low
eor x7, x7, x14 //AES block 0 - round 10 high
@@ -279,6 +304,10 @@ aes_gcm_enc_128_kernel:
.L128_enc_main_loop: //main loop start
ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
@@ -313,7 +342,10 @@ aes_gcm_enc_128_kernel:
pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
rev w9, w12 //CTR block 4k+8
@@ -395,7 +427,10 @@ aes_gcm_enc_128_kernel:
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
@@ -403,7 +438,10 @@ aes_gcm_enc_128_kernel:
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
@@ -712,7 +750,10 @@ aes_gcm_enc_128_kernel:
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
cmp x5, #48
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
@@ -750,7 +791,10 @@ aes_gcm_enc_128_kernel:
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
rev64 v4.16b, v5.16b //GHASH final-3 block
eor v4.16b, v4.16b, v8.16b //feed in partial tag
@@ -779,7 +823,10 @@ aes_gcm_enc_128_kernel:
rev64 v4.16b, v5.16b //GHASH final-2 block
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor v4.16b, v4.16b, v8.16b //feed in partial tag
eor x6, x6, x13 //AES final-1 block - round 10 low
@@ -813,7 +860,10 @@ aes_gcm_enc_128_kernel:
rev64 v4.16b, v5.16b //GHASH final-1 block
ldp x6, x7, [x0], #16 //AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor v4.16b, v4.16b, v8.16b //feed in partial tag
eor x7, x7, x14 //AES final block - round 10 high
@@ -876,9 +926,11 @@ aes_gcm_enc_128_kernel:
ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
-
+#ifndef __AARCH64EB__
rev w9, w12
-
+#else
+ mov w9, w12
+#endif
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
@@ -951,20 +1003,29 @@ aes_gcm_dec_128_kernel:
lsr x5, x1, #3 //byte_len
mov x15, x5
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #160] //load rk10
+#ifdef __AARCH64EB__
+ ror x14, x14, 32
+ ror x13, x13, 32
+#endif
sub x5, x5, #1 //byte_len - 1
- ldr q18, [x8, #0] //load rk0
+ ld1 {v18.4s}, [x8], #16 //load rk0
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
lsr x12, x11, #32
fmov d2, x10 //CTR block 2
- ldr q19, [x8, #16] //load rk1
+ ld1 {v19.4s}, [x8], #16 //load rk1
orr w11, w11, w11
rev w12, w12 //rev_ctr32
@@ -976,7 +1037,7 @@ aes_gcm_dec_128_kernel:
rev w9, w12 //CTR block 1
orr x9, x11, x9, lsl #32 //CTR block 1
- ldr q20, [x8, #32] //load rk2
+ ld1 {v20.4s}, [x8], #16 //load rk2
add w12, w12, #1 //CTR block 1
fmov v1.d[1], x9 //CTR block 1
@@ -999,19 +1060,19 @@ aes_gcm_dec_128_kernel:
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
- ldr q21, [x8, #48] //load rk3
+ ld1 {v21.4s}, [x8], #16 //load rk3
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
- ldr q24, [x8, #96] //load rk6
+ ld1 {v22.4s}, [x8], #16 //load rk4
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
- ldr q25, [x8, #112] //load rk7
+ ld1 {v23.4s}, [x8], #16 //load rk5
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
- ldr q22, [x8, #64] //load rk4
+ ld1 {v24.4s}, [x8], #16 //load rk6
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
@@ -1021,7 +1082,6 @@ aes_gcm_dec_128_kernel:
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
- ldp x13, x14, [x8, #160] //load rk10
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
@@ -1031,7 +1091,7 @@ aes_gcm_dec_128_kernel:
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
- ldr q23, [x8, #80] //load rk5
+ ld1 {v25.4s}, [x8], #16 //load rk7
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
@@ -1041,7 +1101,7 @@ aes_gcm_dec_128_kernel:
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
- ldr q27, [x8, #144] //load rk9
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
@@ -1052,11 +1112,12 @@ aes_gcm_dec_128_kernel:
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
- ldr q26, [x8, #128] //load rk8
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
@@ -1073,8 +1134,9 @@ aes_gcm_dec_128_kernel:
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
@@ -1092,7 +1154,9 @@ aes_gcm_dec_128_kernel:
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
+#endif
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
add x5, x5, x0
@@ -1134,12 +1198,10 @@ aes_gcm_dec_128_kernel:
eor v17.16b, v17.16b, v9.16b //h4k | h3k
b.ge .L128_dec_tail //handle tail
- ldr q5, [x0, #16] //AES block 1 - load ciphertext
-
- ldr q4, [x0, #0] //AES block 0 - load ciphertext
+ ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
- ldr q6, [x0, #32] //AES block 2 - load ciphertext
+ ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
rev64 v4.16b, v4.16b //GHASH block 0
@@ -1147,10 +1209,9 @@ aes_gcm_dec_128_kernel:
orr x9, x11, x9, lsl #32 //CTR block 4
add w12, w12, #1 //CTR block 4
- ldr q7, [x0, #48] //AES block 3 - load ciphertext
+ ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext
rev64 v5.16b, v5.16b //GHASH block 1
- add x0, x0, #64 //AES input_ptr update
mov x19, v1.d[0] //AES block 1 - mov low
mov x20, v1.d[1] //AES block 1 - mov high
@@ -1165,7 +1226,9 @@ aes_gcm_dec_128_kernel:
fmov v0.d[1], x9 //CTR block 4
rev w9, w12 //CTR block 5
eor x19, x19, x13 //AES block 1 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
fmov d1, x10 //CTR block 5
add w12, w12, #1 //CTR block 5
orr x9, x11, x9, lsl #32 //CTR block 5
@@ -1177,10 +1240,19 @@ aes_gcm_dec_128_kernel:
orr x9, x11, x9, lsl #32 //CTR block 6
eor x20, x20, x14 //AES block 1 - round 10 high
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
eor x6, x6, x13 //AES block 0 - round 10 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
eor x7, x7, x14 //AES block 0 - round 10 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
stp x6, x7, [x2], #16 //AES block 0 - store result
stp x19, x20, [x2], #16 //AES block 1 - store result
@@ -1248,9 +1320,14 @@ aes_gcm_dec_128_kernel:
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
eor x23, x23, x13 //AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
eor x22, x22, x14 //AES block 4k+2 - round 10 high
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
mov d31, v6.d[1] //GHASH block 4k+2 - mid
aese v0.16b, v19.16b
@@ -1288,7 +1365,9 @@ aes_gcm_dec_128_kernel:
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
eor x24, x24, x14 //AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
@@ -1296,7 +1375,9 @@ aes_gcm_dec_128_kernel:
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
eor x21, x21, x13 //AES block 4k+2 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
movi v8.8b, #0xc2
@@ -1318,7 +1399,7 @@ aes_gcm_dec_128_kernel:
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
- ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext
+ ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
@@ -1345,7 +1426,7 @@ aes_gcm_dec_128_kernel:
rev w9, w12 //CTR block 4k+8
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
- ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext
+ ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
aese v0.16b, v27.16b //AES block 4k+4 - round 9
@@ -1363,7 +1444,7 @@ aes_gcm_dec_128_kernel:
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
- ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext
+ ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
add w12, w12, #1 //CTR block 4k+8
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
@@ -1371,11 +1452,10 @@ aes_gcm_dec_128_kernel:
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
- ldr q7, [x0, #48] //AES block 4k+3 - load ciphertext
+ ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
- add x0, x0, #64 //AES input_ptr update
rev64 v5.16b, v5.16b //GHASH block 4k+5
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
@@ -1400,11 +1480,15 @@ aes_gcm_dec_128_kernel:
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
eor x7, x7, x14 //AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
mov x20, v1.d[1] //AES block 4k+5 - mov high
eor x6, x6, x13 //AES block 4k+4 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
mov x19, v1.d[0] //AES block 4k+5 - mov low
add w12, w12, #1 //CTR block 4k+9
@@ -1421,9 +1505,15 @@ aes_gcm_dec_128_kernel:
add w12, w12, #1 //CTR block 4k+10
eor x20, x20, x14 //AES block 4k+5 - round 10 high
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
eor x19, x19, x13 //AES block 4k+5 - round 10 low
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
orr x9, x11, x9, lsl #32 //CTR block 4k+10
@@ -1528,9 +1618,14 @@ aes_gcm_dec_128_kernel:
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
eor x23, x23, x13 //AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
eor x21, x21, x13 //AES block 4k+2 - round 10 low
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
aese v2.16b, v21.16b
@@ -1603,7 +1698,9 @@ aes_gcm_dec_128_kernel:
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
eor x24, x24, x14 //AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
@@ -1621,7 +1718,9 @@ aes_gcm_dec_128_kernel:
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
eor x22, x22, x14 //AES block 4k+2 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
aese v0.16b, v27.16b //AES block 4k+4 - round 9
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
@@ -1645,9 +1744,14 @@ aes_gcm_dec_128_kernel:
cmp x5, #48
eor x7, x7, x14 //AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
eor x6, x6, x13 //AES block 4k+4 - round 10 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
b.gt .L128_dec_blocks_more_than_3
mov v3.16b, v2.16b
@@ -1691,9 +1795,14 @@ aes_gcm_dec_128_kernel:
movi v8.8b, #0 //suppress further partial tag feed in
eor x7, x7, x14 //AES final-2 block - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
eor x6, x6, x13 //AES final-2 block - round 10 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
.L128_dec_blocks_more_than_2: //blocks left > 2
rev64 v4.16b, v5.16b //GHASH final-2 block
@@ -1719,12 +1828,18 @@ aes_gcm_dec_128_kernel:
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
eor x6, x6, x13 //AES final-1 block - round 10 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
eor x7, x7, x14 //AES final-1 block - round 10 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
.L128_dec_blocks_more_than_1: //blocks left > 1
rev64 v4.16b, v5.16b //GHASH final-1 block
@@ -1755,8 +1870,13 @@ aes_gcm_dec_128_kernel:
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
eor x7, x7, x14 //AES final block - round 10 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES final block - round 10 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: //blocks left <= 1
@@ -1802,7 +1922,11 @@ aes_gcm_dec_128_kernel:
bic x4, x4, x9 //mask out low existing bytes
and x6, x6, x9
+#ifndef __AARCH64EB__
rev w9, w12
+#else
+ mov w9, w12
+#endif
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
movi v8.8b, #0xc2
@@ -1869,18 +1993,26 @@ aes_gcm_enc_192_kernel:
stp d14, d15, [sp, #96]
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #192] //load rk12
+#ifdef __AARCH64EB__
+ ror x13, x13, #32
+ ror x14, x14, #32
+#endif
+ ld1 {v18.4s}, [x8], #16 //load rk0
- ldr q23, [x8, #80] //load rk5
-
- ldr q22, [x8, #64] //load rk4
+ ld1 {v19.4s}, [x8], #16 //load rk1
- ldr q26, [x8, #128] //load rk8
+ ld1 {v20.4s}, [x8], #16 //load rk2
lsr x12, x11, #32
- ldr q24, [x8, #96] //load rk6
+ ld1 {v21.4s}, [x8], #16 //load rk3
orr w11, w11, w11
- ldr q25, [x8, #112] //load rk7
+ ld1 {v22.4s}, [x8], #16 //load rk4
rev w12, w12 //rev_ctr32
add w12, w12, #1 //increment rev_ctr32
@@ -1904,15 +2036,13 @@ aes_gcm_enc_192_kernel:
rev w9, w12 //CTR block 3
orr x9, x11, x9, lsl #32 //CTR block 3
- ldr q18, [x8, #0] //load rk0
+ ld1 {v23.4s}, [x8], #16 //load rk5
fmov v3.d[1], x9 //CTR block 3
- ldr q21, [x8, #48] //load rk3
+ ld1 {v24.4s}, [x8], #16 //load rk6
- ldp x13, x14, [x8, #192] //load rk12
-
- ldr q19, [x8, #16] //load rk1
+ ld1 {v25.4s}, [x8], #16 //load rk7
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
@@ -1922,35 +2052,38 @@ aes_gcm_enc_192_kernel:
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
- ldr q29, [x8, #176] //load rk11
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
- ldr q20, [x8, #32] //load rk2
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
- ldr q28, [x8, #160] //load rk10
+ ld1 {v28.4s}, [x8], #16 //load rk10
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
- ldr q27, [x8, #144] //load rk9
+ ld1 {v29.4s}, [x8], #16 //load rk11
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
@@ -2007,8 +2140,9 @@ aes_gcm_enc_192_kernel:
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
@@ -2088,13 +2222,26 @@ aes_gcm_enc_192_kernel:
rev w9, w12 //CTR block 4
ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
orr x9, x11, x9, lsl #32 //CTR block 4
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
add x0, x0, #64 //AES input_ptr update
cmp x0, x5 //check if we have <= 8 blocks
@@ -2166,7 +2313,10 @@ aes_gcm_enc_192_kernel:
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
fmov d3, x10 //CTR block 4k+3
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
@@ -2178,11 +2328,17 @@ aes_gcm_enc_192_kernel:
pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
eor v4.16b, v4.16b, v11.16b //PRE 1
@@ -2275,7 +2431,10 @@ aes_gcm_enc_192_kernel:
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
@@ -2640,7 +2799,10 @@ aes_gcm_enc_192_kernel:
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES block 4k+4 - round 12 low
eor x7, x7, x14 //AES block 4k+4 - round 12 high
@@ -2677,7 +2839,10 @@ aes_gcm_enc_192_kernel:
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
rev64 v4.16b, v5.16b //GHASH final-3 block
eor x6, x6, x13 //AES final-2 block - round 12 low
@@ -2708,7 +2873,10 @@ aes_gcm_enc_192_kernel:
rev64 v4.16b, v5.16b //GHASH final-2 block
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor v4.16b, v4.16b, v8.16b //feed in partial tag
eor x7, x7, x14 //AES final-1 block - round 12 high
@@ -2739,7 +2907,10 @@ aes_gcm_enc_192_kernel:
st1 { v5.16b}, [x2], #16 //AES final-1 block - store result
ldp x6, x7, [x0], #16 //AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
rev64 v4.16b, v5.16b //GHASH final-1 block
eor x6, x6, x13 //AES final block - round 12 low
@@ -2771,7 +2942,11 @@ aes_gcm_enc_192_kernel:
.L192_enc_blocks_less_than_1: //blocks left <= 1
ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+#ifndef __AARCH64EB__
rev w9, w12
+#else
+ mov w9, w12
+#endif
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
@@ -2876,14 +3051,22 @@ aes_gcm_dec_192_kernel:
add x4, x0, x1, lsr #3 //end_input_ptr
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #192] //load rk12
+#ifdef __AARCH64EB__
+ ror x13, x13, #32
+ ror x14, x14, #32
+#endif
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
- ldr q18, [x8, #0] //load rk0
+ ld1 {v18.4s}, [x8], #16 //load rk0
lsr x5, x1, #3 //byte_len
mov x15, x5
- ldr q20, [x8, #32] //load rk2
+ ld1 {v19.4s}, [x8], #16 //load rk1
lsr x12, x11, #32
orr w11, w11, w11
@@ -2893,7 +3076,7 @@ aes_gcm_dec_192_kernel:
fmov d1, x10 //CTR block 1
add w12, w12, #1 //increment rev_ctr32
- ldr q19, [x8, #16] //load rk1
+ ld1 {v20.4s}, [x8], #16 //load rk2
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
@@ -2901,7 +3084,7 @@ aes_gcm_dec_192_kernel:
add w12, w12, #1 //CTR block 1
orr x9, x11, x9, lsl #32 //CTR block 1
- ldr q21, [x8, #48] //load rk3
+ ld1 {v21.4s}, [x8], #16 //load rk3
fmov v1.d[1], x9 //CTR block 1
rev w9, w12 //CTR block 2
@@ -2919,54 +3102,57 @@ aes_gcm_dec_192_kernel:
fmov v3.d[1], x9 //CTR block 3
- ldr q26, [x8, #128] //load rk8
+ ld1 {v22.4s}, [x8], #16 //load rk4
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
- ldr q29, [x8, #176] //load rk11
+ ld1 {v23.4s}, [x8], #16 //load rk5
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
- ldp x13, x14, [x8, #192] //load rk12
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
- ldr q28, [x8, #160] //load rk10
+ ld1 {v24.4s}, [x8], #16 //load rk6
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
- ldr q27, [x8, #144] //load rk9
+ ld1 {v25.4s}, [x8], #16 //load rk7
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
- ldr q25, [x8, #112] //load rk7
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
- ldr q22, [x8, #64] //load rk4
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
@@ -2984,7 +3170,7 @@ aes_gcm_dec_192_kernel:
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
- ldr q23, [x8, #80] //load rk5
+ ld1 {v28.4s}, [x8], #16 //load rk10
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
@@ -2999,7 +3185,7 @@ aes_gcm_dec_192_kernel:
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
- ldr q24, [x8, #96] //load rk6
+ ld1 {v29.4s}, [x8], #16 //load rk11
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
@@ -3086,17 +3272,13 @@ aes_gcm_dec_192_kernel:
aese v0.16b, v29.16b //AES block 0 - round 11
b.ge .L192_dec_tail //handle tail
- ldr q5, [x0, #16] //AES block 1 - load ciphertext
-
- ldr q4, [x0, #0] //AES block 0 - load ciphertext
+ ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
rev w9, w12 //CTR block 4
- ldr q7, [x0, #48] //AES block 3 - load ciphertext
-
- ldr q6, [x0, #32] //AES block 2 - load ciphertext
+ ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext
mov x19, v1.d[0] //AES block 1 - mov low
@@ -3108,27 +3290,35 @@ aes_gcm_dec_192_kernel:
mov x7, v0.d[1] //AES block 0 - mov high
rev64 v4.16b, v4.16b //GHASH block 0
- add x0, x0, #64 //AES input_ptr update
fmov d0, x10 //CTR block 4
rev64 v5.16b, v5.16b //GHASH block 1
cmp x0, x5 //check if we have <= 8 blocks
eor x19, x19, x13 //AES block 1 - round 12 low
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
fmov v0.d[1], x9 //CTR block 4
rev w9, w12 //CTR block 5
orr x9, x11, x9, lsl #32 //CTR block 5
fmov d1, x10 //CTR block 5
eor x20, x20, x14 //AES block 1 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
add w12, w12, #1 //CTR block 5
fmov v1.d[1], x9 //CTR block 5
eor x6, x6, x13 //AES block 0 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
rev w9, w12 //CTR block 6
eor x7, x7, x14 //AES block 0 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
stp x6, x7, [x2], #16 //AES block 0 - store result
orr x9, x11, x9, lsl #32 //CTR block 6
@@ -3191,7 +3381,9 @@ aes_gcm_dec_192_kernel:
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
eor x22, x22, x14 //AES block 4k+2 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
@@ -3208,7 +3400,9 @@ aes_gcm_dec_192_kernel:
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
eor x21, x21, x13 //AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
@@ -3310,16 +3504,18 @@ aes_gcm_dec_192_kernel:
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
- ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext
+ ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
- ldr q7, [x0, #48] //AES block 4k+7 - load ciphertext
+ ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
eor x23, x23, x13 //AES block 4k+3 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
@@ -3333,10 +3529,10 @@ aes_gcm_dec_192_kernel:
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
- ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext
+ ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
aese v1.16b, v29.16b //AES block 4k+5 - round 11
- ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext
+ ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
rev w9, w12 //CTR block 4k+8
aese v3.16b, v26.16b
@@ -3347,11 +3543,13 @@ aes_gcm_dec_192_kernel:
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
- add x0, x0, #64 //AES input_ptr update
cmp x0, x5 //.LOOP CONTROL
eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
eor x24, x24, x14 //AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
aese v2.16b, v28.16b
@@ -3384,18 +3582,28 @@ aes_gcm_dec_192_kernel:
rev w9, w12 //CTR block 4k+9
eor x6, x6, x13 //AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
orr x9, x11, x9, lsl #32 //CTR block 4k+9
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
fmov d1, x10 //CTR block 4k+9
add w12, w12, #1 //CTR block 4k+9
eor x19, x19, x13 //AES block 4k+5 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
fmov v1.d[1], x9 //CTR block 4k+9
rev w9, w12 //CTR block 4k+10
eor x20, x20, x14 //AES block 4k+5 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
eor x7, x7, x14 //AES block 4k+4 - round 12 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
@@ -3449,18 +3657,29 @@ aes_gcm_dec_192_kernel:
pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
eor x24, x24, x14 //AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
fmov v3.d[1], x9 //CTR block 4k+7
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
eor x21, x21, x13 //AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
eor x22, x22, x14 //AES block 4k+2 - round 12 high
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
eor x23, x23, x13 //AES block 4k+3 - round 12 low
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
rev64 v7.16b, v7.16b //GHASH block 4k+3
@@ -3650,8 +3869,13 @@ aes_gcm_dec_192_kernel:
cmp x5, #48
eor x7, x7, x14 //AES block 4k+4 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
b.gt .L192_dec_blocks_more_than_3
movi v11.8b, #0
@@ -3695,10 +3919,16 @@ aes_gcm_dec_192_kernel:
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
eor x6, x6, x13 //AES final-2 block - round 12 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
movi v8.8b, #0 //suppress further partial tag feed in
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
eor x7, x7, x14 //AES final-2 block - round 12 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
.L192_dec_blocks_more_than_2: //blocks left > 2
rev64 v4.16b, v5.16b //GHASH final-2 block
@@ -3728,8 +3958,13 @@ aes_gcm_dec_192_kernel:
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
eor x7, x7, x14 //AES final-1 block - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES final-1 block - round 12 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
.L192_dec_blocks_more_than_1: //blocks left > 1
@@ -3760,9 +3995,13 @@ aes_gcm_dec_192_kernel:
movi v8.8b, #0 //suppress further partial tag feed in
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
eor x7, x7, x14 //AES final block - round 12 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES final block - round 12 low
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
.L192_dec_blocks_less_than_1: //blocks left <= 1
@@ -3789,8 +4028,11 @@ aes_gcm_dec_192_kernel:
orr x6, x6, x4
mov v0.d[1], x10
-
+#ifndef __AARCH64EB__
rev w9, w12
+#else
+ mov w9, w12
+#endif
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
str w9, [x16, #12] //store the updated counter
@@ -3878,14 +4120,22 @@ aes_gcm_enc_256_kernel:
lsr x5, x1, #3 //byte_len
mov x15, x5
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #224] //load rk14
+#ifdef __AARCH64EB__
+ ror x13, x13, #32
+ ror x14, x14, #32
+#endif
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
sub x5, x5, #1 //byte_len - 1
- ldr q18, [x8, #0] //load rk0
+ ld1 {v18.4s}, [x8], #16 //load rk0
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
- ldr q25, [x8, #112] //load rk7
+ ld1 {v19.4s}, [x8], #16 //load rk1
add x5, x5, x0
lsr x12, x11, #32
@@ -3905,14 +4155,14 @@ aes_gcm_enc_256_kernel:
orr x9, x11, x9, lsl #32 //CTR block 1
add w12, w12, #1 //CTR block 1
- ldr q19, [x8, #16] //load rk1
+ ld1 {v20.4s}, [x8], #16 //load rk2
fmov v1.d[1], x9 //CTR block 1
rev w9, w12 //CTR block 2
add w12, w12, #1 //CTR block 2
orr x9, x11, x9, lsl #32 //CTR block 2
- ldr q20, [x8, #32] //load rk2
+ ld1 {v21.4s}, [x8], #16 //load rk3
fmov v2.d[1], x9 //CTR block 2
rev w9, w12 //CTR block 3
@@ -3925,50 +4175,53 @@ aes_gcm_enc_256_kernel:
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
- ldr q21, [x8, #48] //load rk3
+ ld1 {v22.4s}, [x8], #16 //load rk4
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
- ldr q24, [x8, #96] //load rk6
+ ld1 {v23.4s}, [x8], #16 //load rk5
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
- ldr q23, [x8, #80] //load rk5
+ ld1 {v24.4s}, [x8], #16 //load rk6
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
- ldr q31, [x8, #208] //load rk13
+ ld1 {v25.4s}, [x8], #16 //load rk7
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
- ldr q22, [x8, #64] //load rk4
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
- ldr q30, [x8, #192] //load rk12
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
- ldr q29, [x8, #176] //load rk11
+ ld1 {v28.4s}, [x8], #16 //load rk10
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
- ldr q26, [x8, #128] //load rk8
+ ld1 {v29.4s}, [x8], #16 //load rk11
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
@@ -3976,7 +4229,6 @@ aes_gcm_enc_256_kernel:
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
- ldp x13, x14, [x8, #224] //load rk14
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
@@ -4014,16 +4266,17 @@ aes_gcm_enc_256_kernel:
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
- ldr q27, [x8, #144] //load rk9
+ ld1 {v30.4s}, [x8], #16 //load rk12
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
- ldr q28, [x8, #160] //load rk10
+ ld1 {v31.4s}, [x8], #16 //load rk13
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
@@ -4112,13 +4365,26 @@ aes_gcm_enc_256_kernel:
b.ge .L256_enc_tail //handle tail
ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
rev w9, w12 //CTR block 4
ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
add x0, x0, #64 //AES input_ptr update
eor x19, x19, x13 //AES block 1 - round 14 low
@@ -4201,11 +4467,17 @@ aes_gcm_enc_256_kernel:
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
eor v4.16b, v4.16b, v11.16b //PRE 1
@@ -4315,7 +4587,10 @@ aes_gcm_enc_256_kernel:
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
mov d4, v7.d[1] //GHASH block 4k+3 - mid
@@ -4352,7 +4627,10 @@ aes_gcm_enc_256_kernel:
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
shl d8, d8, #56 //mod_constant
@@ -4714,7 +4992,10 @@ aes_gcm_enc_256_kernel:
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES block 4k+4 - round 14 low
eor x7, x7, x14 //AES block 4k+4 - round 14 high
@@ -4749,7 +5030,10 @@ aes_gcm_enc_256_kernel:
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
rev64 v4.16b, v5.16b //GHASH final-3 block
eor x6, x6, x13 //AES final-2 block - round 14 low
@@ -4778,7 +5062,10 @@ aes_gcm_enc_256_kernel:
st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
rev64 v4.16b, v5.16b //GHASH final-2 block
eor x6, x6, x13 //AES final-1 block - round 14 low
@@ -4814,7 +5101,10 @@ aes_gcm_enc_256_kernel:
rev64 v4.16b, v5.16b //GHASH final-1 block
ldp x6, x7, [x0], #16 //AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
eor v4.16b, v4.16b, v8.16b //feed in partial tag
movi v8.8b, #0 //suppress further partial tag feed in
@@ -4875,7 +5165,11 @@ aes_gcm_enc_256_kernel:
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
mov d8, v4.d[1] //GHASH final block - mid
+#ifndef __AARCH64EB__
rev w9, w12
+#else
+ mov w9, w12
+#endif
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
@@ -4949,21 +5243,29 @@ aes_gcm_dec_256_kernel:
lsr x5, x1, #3 //byte_len
mov x15, x5
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
-
- ldr q26, [x8, #128] //load rk8
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #224] //load rk14
+#ifdef __AARCH64EB__
+ ror x14, x14, #32
+ ror x13, x13, #32
+#endif
+ ld1 {v18.4s}, [x8], #16 //load rk0
sub x5, x5, #1 //byte_len - 1
- ldr q25, [x8, #112] //load rk7
+ ld1 {v19.4s}, [x8], #16 //load rk1
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add x4, x0, x1, lsr #3 //end_input_ptr
- ldr q24, [x8, #96] //load rk6
+ ld1 {v20.4s}, [x8], #16 //load rk2
lsr x12, x11, #32
- ldr q23, [x8, #80] //load rk5
+ ld1 {v21.4s}, [x8], #16 //load rk3
orr w11, w11, w11
- ldr q21, [x8, #48] //load rk3
+ ld1 {v22.4s}, [x8], #16 //load rk4
add x5, x5, x0
rev w12, w12 //rev_ctr32
@@ -4988,39 +5290,44 @@ aes_gcm_dec_256_kernel:
rev w9, w12 //CTR block 3
orr x9, x11, x9, lsl #32 //CTR block 3
- ldr q18, [x8, #0] //load rk0
+ ld1 {v23.4s}, [x8], #16 //load rk5
fmov v3.d[1], x9 //CTR block 3
add w12, w12, #1 //CTR block 3
- ldr q22, [x8, #64] //load rk4
+ ld1 {v24.4s}, [x8], #16 //load rk6
- ldr q31, [x8, #208] //load rk13
+ ld1 {v25.4s}, [x8], #16 //load rk7
- ldr q19, [x8, #16] //load rk1
+ ld1 {v26.4s}, [x8], #16 //load rk8
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
+#endif
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
+#endif
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
+#endif
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
- ldr q20, [x8, #32] //load rk2
+ ld1 {v27.4s}, [x8], #16 //load rk9
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
- ldp x13, x14, [x8, #224] //load rk14
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
@@ -5030,20 +5337,21 @@ aes_gcm_dec_256_kernel:
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
- ldr q27, [x8, #144] //load rk9
+ ld1 {v28.4s}, [x8], #16 //load rk10
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
- ldr q30, [x8, #192] //load rk12
+ ld1 {v29.4s}, [x8], #16 //load rk11
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
- ldr q28, [x8, #160] //load rk10
+ ld1 {v30.4s}, [x8], #16 //load rk12
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
@@ -5126,7 +5434,7 @@ aes_gcm_dec_256_kernel:
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
- ldr q29, [x8, #176] //load rk11
+ ld1 {v31.4s}, [x8], #16 //load rk13
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
@@ -5191,9 +5499,7 @@ aes_gcm_dec_256_kernel:
aese v0.16b, v31.16b //AES block 0 - round 13
b.ge .L256_dec_tail //handle tail
- ldr q4, [x0, #0] //AES block 0 - load ciphertext
-
- ldr q5, [x0, #16] //AES block 1 - load ciphertext
+ ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
rev w9, w12 //CTR block 4
@@ -5201,7 +5507,7 @@ aes_gcm_dec_256_kernel:
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
rev64 v5.16b, v5.16b //GHASH block 1
- ldr q7, [x0, #48] //AES block 3 - load ciphertext
+ ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext
mov x7, v0.d[1] //AES block 0 - mov high
@@ -5221,22 +5527,32 @@ aes_gcm_dec_256_kernel:
orr x9, x11, x9, lsl #32 //CTR block 5
mov x20, v1.d[1] //AES block 1 - mov high
eor x7, x7, x14 //AES block 0 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
eor x6, x6, x13 //AES block 0 - round 14 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
stp x6, x7, [x2], #16 //AES block 0 - store result
fmov d1, x10 //CTR block 5
- ldr q6, [x0, #32] //AES block 2 - load ciphertext
- add x0, x0, #64 //AES input_ptr update
+ ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext
fmov v1.d[1], x9 //CTR block 5
rev w9, w12 //CTR block 6
add w12, w12, #1 //CTR block 6
eor x19, x19, x13 //AES block 1 - round 14 low
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
orr x9, x11, x9, lsl #32 //CTR block 6
eor x20, x20, x14 //AES block 1 - round 14 high
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
stp x19, x20, [x2], #16 //AES block 1 - store result
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
@@ -5287,7 +5603,9 @@ aes_gcm_dec_256_kernel:
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
eor x22, x22, x14 //AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
mov d10, v17.d[1] //GHASH block 4k - mid
@@ -5299,7 +5617,9 @@ aes_gcm_dec_256_kernel:
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
eor x21, x21, x13 //AES block 4k+2 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
@@ -5314,9 +5634,14 @@ aes_gcm_dec_256_kernel:
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
eor x23, x23, x13 //AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
eor x24, x24, x14 //AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
aese v2.16b, v22.16b
@@ -5437,7 +5762,7 @@ aes_gcm_dec_256_kernel:
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
- ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext
+ ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
aese v0.16b, v31.16b //AES block 4k+4 - round 13
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
@@ -5448,7 +5773,7 @@ aes_gcm_dec_256_kernel:
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
- ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext
+ ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
@@ -5464,11 +5789,11 @@ aes_gcm_dec_256_kernel:
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
- ldr q7, [x0, #48] //AES block 4k+7 - load ciphertext
+ ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
- ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext
+ ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
@@ -5479,7 +5804,6 @@ aes_gcm_dec_256_kernel:
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
aese v1.16b, v31.16b //AES block 4k+5 - round 13
- add x0, x0, #64 //AES input_ptr update
mov x6, v0.d[0] //AES block 4k+4 - mov low
aese v2.16b, v30.16b
@@ -5501,8 +5825,13 @@ aes_gcm_dec_256_kernel:
add w12, w12, #1 //CTR block 4k+9
eor x6, x6, x13 //AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor x7, x7, x14 //AES block 4k+4 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
mov x20, v1.d[1] //AES block 4k+5 - mov high
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
@@ -5523,9 +5852,15 @@ aes_gcm_dec_256_kernel:
rev64 v5.16b, v5.16b //GHASH block 4k+5
eor x20, x20, x14 //AES block 4k+5 - round 14 high
+#ifdef __AARCH64EB__
+ rev x20, x20
+#endif
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
eor x19, x19, x13 //AES block 4k+5 - round 14 low
+#ifdef __AARCH64EB__
+ rev x19, x19
+#endif
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
rev64 v4.16b, v4.16b //GHASH block 4k+4
@@ -5732,11 +6067,15 @@ aes_gcm_dec_256_kernel:
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
eor x22, x22, x14 //AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev x22, x22
+#endif
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
eor x23, x23, x13 //AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev x23, x23
+#endif
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
@@ -5748,12 +6087,18 @@ aes_gcm_dec_256_kernel:
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
eor x21, x21, x13 //AES block 4k+2 - round 14 low
+#ifdef __AARCH64EB__
+ rev x21, x21
+#endif
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
eor x24, x24, x14 //AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+ rev x24, x24
+#endif
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
@@ -5794,8 +6139,14 @@ aes_gcm_dec_256_kernel:
cmp x5, #48
eor x6, x6, x13 //AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor x7, x7, x14 //AES block 4k+4 - round 14 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
b.gt .L256_dec_blocks_more_than_3
sub w12, w12, #1
@@ -5843,9 +6194,15 @@ aes_gcm_dec_256_kernel:
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
eor x6, x6, x13 //AES final-2 block - round 14 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
eor x7, x7, x14 //AES final-2 block - round 14 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
.L256_dec_blocks_more_than_2: //blocks left > 2
rev64 v4.16b, v5.16b //GHASH final-2 block
@@ -5873,9 +6230,15 @@ aes_gcm_dec_256_kernel:
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
eor x6, x6, x13 //AES final-1 block - round 14 low
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
eor x7, x7, x14 //AES final-1 block - round 14 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
.L256_dec_blocks_more_than_1: //blocks left > 1
stp x6, x7, [x2], #16 //AES final-1 block - store result
@@ -5903,13 +6266,18 @@ aes_gcm_dec_256_kernel:
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
eor x6, x6, x13 //AES final block - round 14 low
-
+#ifdef __AARCH64EB__
+ rev x6, x6
+#endif
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
eor x7, x7, x14 //AES final block - round 14 high
+#ifdef __AARCH64EB__
+ rev x7, x7
+#endif
.L256_dec_blocks_less_than_1: //blocks left <= 1
and x1, x1, #127 //bit_length %= 128
@@ -5935,7 +6303,11 @@ aes_gcm_dec_256_kernel:
mov v0.d[1], x10
bic x4, x4, x9 //mask out low existing bytes
+#ifndef __AARCH64EB__
rev w9, w12
+#else
+ mov w9, w12
+#endif
bic x5, x5, x10 //mask out high existing bytes
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.6
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.5 Wed May 10 21:31:54 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S Wed May 31 15:35:31 2023
@@ -1859,10 +1859,10 @@ aes_v8_xts_encrypt:
b.ne .Lxts_enc_big_size
// Encrypt the iv with key2, as the first XEX iv.
ldr w6,[x4,#240]
- ld1 {v0.16b},[x4],#16
+ ld1 {v0.4s},[x4],#16
ld1 {v6.16b},[x5]
sub w6,w6,#2
- ld1 {v1.16b},[x4],#16
+ ld1 {v1.4s},[x4],#16
.Loop_enc_iv_enc:
aese v6.16b,v0.16b
@@ -2462,9 +2462,9 @@ aes_v8_xts_encrypt:
// Encrypt the composite block to get the last second encrypted text block
ldr w6,[x3,#240] // load key schedule...
- ld1 {v0.16b},[x3],#16
+ ld1 {v0.4s},[x3],#16
sub w6,w6,#2
- ld1 {v1.16b},[x3],#16 // load key schedule...
+ ld1 {v1.4s},[x3],#16 // load key schedule...
.Loop_final_enc:
aese v26.16b,v0.16b
aesmc v26.16b,v26.16b
@@ -2500,10 +2500,10 @@ aes_v8_xts_decrypt:
b.ne .Lxts_dec_big_size
// Encrypt the iv with key2, as the first XEX iv.
ldr w6,[x4,#240]
- ld1 {v0.16b},[x4],#16
+ ld1 {v0.4s},[x4],#16
ld1 {v6.16b},[x5]
sub w6,w6,#2
- ld1 {v1.16b},[x4],#16
+ ld1 {v1.4s},[x4],#16
.Loop_dec_small_iv_enc:
aese v6.16b,v0.16b
@@ -2581,10 +2581,10 @@ aes_v8_xts_decrypt:
// Encrypt the iv with key2, as the first XEX iv
ldr w6,[x4,#240]
- ld1 {v0.16b},[x4],#16
+ ld1 {v0.4s},[x4],#16
ld1 {v6.16b},[x5]
sub w6,w6,#2
- ld1 {v1.16b},[x4],#16
+ ld1 {v1.4s},[x4],#16
.Loop_dec_iv_enc:
aese v6.16b,v0.16b
@@ -2914,7 +2914,7 @@ aes_v8_xts_decrypt:
.align 4
.Lxts_dec_tail4x:
add x0,x0,#16
- ld1 {v0.4s},[x0],#16
+ tst x21,#0xf
eor v5.16b,v1.16b,v4.16b
st1 {v5.16b},[x1],#16
eor v17.16b,v24.16b,v17.16b
@@ -2923,6 +2923,8 @@ aes_v8_xts_decrypt:
eor v31.16b,v26.16b,v31.16b
st1 {v30.16b,v31.16b},[x1],#32
+ b.eq .Lxts_dec_abort
+ ld1 {v0.16b},[x0],#16
b .Lxts_done
.align 4
.Lxts_outer_dec_tail:
@@ -3100,7 +3102,7 @@ aes_v8_xts_decrypt:
// Processing the last two blocks with cipher stealing.
mov x7,x3
cbnz x2,.Lxts_dec_1st_done
- ld1 {v0.4s},[x0],#16
+ ld1 {v0.16b},[x0],#16
// Decrypt the last secod block to get the last plain text block
.Lxts_dec_1st_done:
@@ -3145,9 +3147,9 @@ aes_v8_xts_decrypt:
// Decrypt the composite block to get the last second plain text block
ldr w6,[x7,#240]
- ld1 {v0.16b},[x7],#16
+ ld1 {v0.4s},[x7],#16
sub w6,w6,#2
- ld1 {v1.16b},[x7],#16
+ ld1 {v1.4s},[x7],#16
.Loop_final_dec:
aesd v26.16b,v0.16b
aesimc v26.16b,v26.16b
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.2
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.1 Tue May 9 13:22:43 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S Wed May 31 15:35:31 2023
@@ -29,28 +29,36 @@ aes_gcm_enc_128_kernel:
stp d14, d15, [sp, #96]
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
ldp r13, r14, [r8, #160] @ load rk10
-
+#ifdef __ARMEB__
+ ror r13, r13, #32
+ ror r14, r14, #32
+#endif
ld1 {v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
lsr r5, r1, #3 @ byte_len
mov r15, r5
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v18.4s}, [r8], #16 @ load rk0
add r4, r0, r1, lsr #3 @ end_input_ptr
sub r5, r5, #1 @ byte_len - 1
lsr r12, r11, #32
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
fmov d1, r10 @ CTR block 1
rev r12, r12 @ rev_ctr32
add r12, r12, #1 @ increment rev_ctr32
orr r11, r11, r11
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v19.4s}, [r8], #16 @ load rk1
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
@@ -70,30 +78,33 @@ aes_gcm_enc_128_kernel:
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v20.4s}, [r8], #16 @ load rk2
add r12, r12, #1 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v21.4s}, [r8], #16 @ load rk3
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
+#endif
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
- ldr q26, [r8, #128] @ load rk8
+ ld1 {v22.4s}, [r8], #16 @ load rk4
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v23.4s}, [r8], #16 @ load rk5
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
@@ -101,11 +112,11 @@ aes_gcm_enc_128_kernel:
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v24.4s}, [r8], #16 @ load rk6
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v25.4s}, [r8], #16 @ load rk7
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
@@ -113,12 +124,14 @@ aes_gcm_enc_128_kernel:
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
- ldr q23, [r8, #80] @ load rk5
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
+#endif
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
@@ -135,7 +148,7 @@ aes_gcm_enc_128_kernel:
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
@@ -218,13 +231,25 @@ aes_gcm_enc_128_kernel:
bge .L128_enc_tail @ handle tail
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
-
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
-
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
-
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
eor r6, r6, r13 @ AES block 0 - round 10 low
eor r7, r7, r14 @ AES block 0 - round 10 high
@@ -289,6 +314,10 @@ aes_gcm_enc_128_kernel:
.L128_enc_main_loop:@ main loop start
ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
@@ -323,7 +352,10 @@ aes_gcm_enc_128_kernel:
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
rev r9, r12 @ CTR block 4k+8
@@ -405,7 +437,10 @@ aes_gcm_enc_128_kernel:
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
@@ -413,7 +448,10 @@ aes_gcm_enc_128_kernel:
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
@@ -722,7 +760,10 @@ aes_gcm_enc_128_kernel:
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
cmp r5, #48
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
@@ -760,7 +801,10 @@ aes_gcm_enc_128_kernel:
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
rev64 q4, q5 @ GHASH final-3 block
eor q4, q4, q8 @ feed in partial tag
@@ -789,7 +833,10 @@ aes_gcm_enc_128_kernel:
rev64 q4, q5 @ GHASH final-2 block
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor q4, q4, q8 @ feed in partial tag
eor r6, r6, r13 @ AES final-1 block - round 10 low
@@ -823,7 +870,10 @@ aes_gcm_enc_128_kernel:
rev64 q4, q5 @ GHASH final-1 block
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final block - round 10 high
@@ -886,9 +936,11 @@ aes_gcm_enc_128_kernel:
ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored
eor q8, q8, q4 @ GHASH final block - mid
-
+#ifndef __ARMEB__
rev r9, r12
-
+#else
+ mov r9, r12
+#endif
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
@@ -961,20 +1013,29 @@ aes_gcm_dec_128_kernel:
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
-
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
+ ldp r13, r14, [r8, #160] @ load rk10
+#ifdef __ARMEB__
+ ror r14, r14, 32
+ ror r13, r13, 32
+#endif
sub r5, r5, #1 @ byte_len - 1
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v18.4s}, [r8], #16 @ load rk0
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
lsr r12, r11, #32
fmov d2, r10 @ CTR block 2
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v19.4s}, [r8], #16 @ load rk1
orr r11, r11, r11
rev r12, r12 @ rev_ctr32
@@ -986,7 +1047,7 @@ aes_gcm_dec_128_kernel:
rev r9, r12 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v20.4s}, [r8], #16 @ load rk2
add r12, r12, #1 @ CTR block 1
fmov v1.d[1], r9 @ CTR block 1
@@ -1009,19 +1070,19 @@ aes_gcm_dec_128_kernel:
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v21.4s}, [r8], #16 @ load rk3
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v22.4s}, [r8], #16 @ load rk4
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v23.4s}, [r8], #16 @ load rk5
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v24.4s}, [r8], #16 @ load rk6
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
@@ -1031,7 +1092,6 @@ aes_gcm_dec_128_kernel:
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
- ldp r13, r14, [r8, #160] @ load rk10
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
@@ -1041,7 +1101,7 @@ aes_gcm_dec_128_kernel:
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
- ldr q23, [r8, #80] @ load rk5
+ ld1 {v25.4s}, [r8], #16 @ load rk7
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
@@ -1051,7 +1111,7 @@ aes_gcm_dec_128_kernel:
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
@@ -1062,11 +1122,12 @@ aes_gcm_dec_128_kernel:
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
- ldr q26, [r8, #128] @ load rk8
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
@@ -1083,8 +1144,9 @@ aes_gcm_dec_128_kernel:
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
@@ -1102,7 +1164,9 @@ aes_gcm_dec_128_kernel:
trn1 q8, v12.2d, v13.2d @ h2h | h1h
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
+#endif
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
add r5, r5, r0
@@ -1144,12 +1208,10 @@ aes_gcm_dec_128_kernel:
eor v17.16b, v17.16b, q9 @ h4k | h3k
bge .L128_dec_tail @ handle tail
- ldr q5, [r0, #16] @ AES block 1 - load ciphertext
-
- ldr q4, [r0, #0] @ AES block 0 - load ciphertext
+ ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
eor q1, q5, q1 @ AES block 1 - result
- ldr q6, [r0, #32] @ AES block 2 - load ciphertext
+ ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext
eor q0, q4, q0 @ AES block 0 - result
rev64 q4, q4 @ GHASH block 0
@@ -1157,10 +1219,9 @@ aes_gcm_dec_128_kernel:
orr r9, r11, r9, lsl #32 @ CTR block 4
add r12, r12, #1 @ CTR block 4
- ldr q7, [r0, #48] @ AES block 3 - load ciphertext
+ ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext
rev64 q5, q5 @ GHASH block 1
- add r0, r0, #64 @ AES input_ptr update
mov r19, v1.d[0] @ AES block 1 - mov low
mov r20, v1.d[1] @ AES block 1 - mov high
@@ -1175,7 +1236,9 @@ aes_gcm_dec_128_kernel:
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
eor r19, r19, r13 @ AES block 1 - round 10 low
-
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
fmov d1, r10 @ CTR block 5
add r12, r12, #1 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
@@ -1187,10 +1250,19 @@ aes_gcm_dec_128_kernel:
orr r9, r11, r9, lsl #32 @ CTR block 6
eor r20, r20, r14 @ AES block 1 - round 10 high
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
eor r6, r6, r13 @ AES block 0 - round 10 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor q2, q6, q2 @ AES block 2 - result
eor r7, r7, r14 @ AES block 0 - round 10 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
stp r19, r20, [r2], #16 @ AES block 1 - store result
@@ -1258,9 +1330,14 @@ aes_gcm_dec_128_kernel:
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
eor r23, r23, r13 @ AES block 4k+3 - round 10 low
-
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
eor r22, r22, r14 @ AES block 4k+2 - round 10 high
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q0, v19.16b
@@ -1298,7 +1375,9 @@ aes_gcm_dec_128_kernel:
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor r24, r24, r14 @ AES block 4k+3 - round 10 high
-
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
@@ -1306,7 +1385,9 @@ aes_gcm_dec_128_kernel:
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
eor r21, r21, r13 @ AES block 4k+2 - round 10 low
-
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
movi q8, #0xc2
@@ -1328,7 +1409,7 @@ aes_gcm_dec_128_kernel:
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor q9, q9, q4 @ GHASH block 4k+3 - high
- ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
@@ -1355,7 +1436,7 @@ aes_gcm_dec_128_kernel:
rev r9, r12 @ CTR block 4k+8
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
- ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q0, v27.16b @ AES block 4k+4 - round 9
@@ -1373,7 +1454,7 @@ aes_gcm_dec_128_kernel:
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
- ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext
add r12, r12, #1 @ CTR block 4k+8
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
@@ -1381,11 +1462,10 @@ aes_gcm_dec_128_kernel:
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
- ldr q7, [r0, #48] @ AES block 4k+3 - load ciphertext
+ ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
- add r0, r0, #64 @ AES input_ptr update
rev64 q5, q5 @ GHASH block 4k+5
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
@@ -1410,11 +1490,15 @@ aes_gcm_dec_128_kernel:
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
mov r20, v1.d[1] @ AES block 4k+5 - mov high
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
-
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor q2, q6, q2 @ AES block 4k+6 - result
mov r19, v1.d[0] @ AES block 4k+5 - mov low
add r12, r12, #1 @ CTR block 4k+9
@@ -1431,9 +1515,15 @@ aes_gcm_dec_128_kernel:
add r12, r12, #1 @ CTR block 4k+10
eor r20, r20, r14 @ AES block 4k+5 - round 10 high
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor r19, r19, r13 @ AES block 4k+5 - round 10 low
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
stp r19, r20, [r2], #16 @ AES block 4k+5 - store result
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
@@ -1538,9 +1628,14 @@ aes_gcm_dec_128_kernel:
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor r23, r23, r13 @ AES block 4k+3 - round 10 low
-
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor r21, r21, r13 @ AES block 4k+2 - round 10 low
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
aese q2, v21.16b
@@ -1613,7 +1708,9 @@ aes_gcm_dec_128_kernel:
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor r24, r24, r14 @ AES block 4k+3 - round 10 high
-
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
@@ -1631,7 +1728,9 @@ aes_gcm_dec_128_kernel:
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor r22, r22, r14 @ AES block 4k+2 - round 10 high
-
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
aese q0, v27.16b @ AES block 4k+4 - round 9
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
@@ -1655,9 +1754,14 @@ aes_gcm_dec_128_kernel:
cmp r5, #48
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
bgt .L128_dec_blocks_more_than_3
mov q3, q2
@@ -1701,9 +1805,14 @@ aes_gcm_dec_128_kernel:
movi q8, #0 @ suppress further partial tag feed in
eor r7, r7, r14 @ AES final-2 block - round 10 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r6, r6, r13 @ AES final-2 block - round 10 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
.L128_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
@@ -1729,12 +1838,18 @@ aes_gcm_dec_128_kernel:
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
eor r6, r6, r13 @ AES final-1 block - round 10 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
eor r7, r7, r14 @ AES final-1 block - round 10 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
.L128_dec_blocks_more_than_1:@ blocks left > 1
rev64 q4, q5 @ GHASH final-1 block
@@ -1765,8 +1880,13 @@ aes_gcm_dec_128_kernel:
eor q9, q9, v20.16b @ GHASH final-1 block - high
eor r7, r7, r14 @ AES final block - round 10 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES final block - round 10 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1:@ blocks left <= 1
@@ -1812,7 +1932,11 @@ aes_gcm_dec_128_kernel:
bic r4, r4, r9 @ mask out low existing bytes
and r6, r6, r9
+#ifndef __ARMEB__
rev r9, r12
+#else
+ mov r9, r12
+#endif
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
@@ -1879,18 +2003,26 @@ aes_gcm_enc_192_kernel:
stp d14, d15, [sp, #96]
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
+ ldp r13, r14, [r8, #192] @ load rk12
+#ifdef __ARMEB__
+ ror r13, r13, #32
+ ror r14, r14, #32
+#endif
+ ld1 {v18.4s}, [r8], #16 @ load rk0
- ldr q23, [r8, #80] @ load rk5
-
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v19.4s}, [r8], #16 @ load rk1
- ldr q26, [r8, #128] @ load rk8
+ ld1 {v20.4s}, [r8], #16 @ load rk2
lsr r12, r11, #32
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v21.4s}, [r8], #16 @ load rk3
orr r11, r11, r11
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v22.4s}, [r8], #16 @ load rk4
rev r12, r12 @ rev_ctr32
add r12, r12, #1 @ increment rev_ctr32
@@ -1914,15 +2046,13 @@ aes_gcm_enc_192_kernel:
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v23.4s}, [r8], #16 @ load rk5
fmov v3.d[1], r9 @ CTR block 3
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v24.4s}, [r8], #16 @ load rk6
- ldp r13, r14, [r8, #192] @ load rk12
-
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v25.4s}, [r8], #16 @ load rk7
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
@@ -1932,35 +2062,38 @@ aes_gcm_enc_192_kernel:
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
- ldr q29, [r8, #176] @ load rk11
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
- ldr q28, [r8, #160] @ load rk10
+ ld1 {v28.4s}, [r8], #16 @ load rk10
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v29.4s}, [r8], #16 @ load rk11
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
@@ -2017,8 +2150,9 @@ aes_gcm_enc_192_kernel:
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
@@ -2098,13 +2232,26 @@ aes_gcm_enc_192_kernel:
rev r9, r12 @ CTR block 4
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
orr r9, r11, r9, lsl #32 @ CTR block 4
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
-
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
-
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
add r0, r0, #64 @ AES input_ptr update
cmp r0, r5 @ check if we have <= 8 blocks
@@ -2176,7 +2323,10 @@ aes_gcm_enc_192_kernel:
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
fmov d3, r10 @ CTR block 4k+3
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
@@ -2188,11 +2338,17 @@ aes_gcm_enc_192_kernel:
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext
-
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor q4, q4, v11.16b @ PRE 1
@@ -2285,7 +2441,10 @@ aes_gcm_enc_192_kernel:
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low
@@ -2650,7 +2809,10 @@ aes_gcm_enc_192_kernel:
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
@@ -2687,7 +2849,10 @@ aes_gcm_enc_192_kernel:
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
rev64 q4, q5 @ GHASH final-3 block
eor r6, r6, r13 @ AES final-2 block - round 12 low
@@ -2718,7 +2883,10 @@ aes_gcm_enc_192_kernel:
rev64 q4, q5 @ GHASH final-2 block
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final-1 block - round 12 high
@@ -2749,7 +2917,10 @@ aes_gcm_enc_192_kernel:
st1 { q5}, [r2], #16 @ AES final-1 block - store result
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
rev64 q4, q5 @ GHASH final-1 block
eor r6, r6, r13 @ AES final block - round 12 low
@@ -2781,7 +2952,11 @@ aes_gcm_enc_192_kernel:
.L192_enc_blocks_less_than_1:@ blocks left <= 1
ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored
+#ifndef __ARMEB__
rev r9, r12
+#else
+ mov r9, r12
+#endif
and r1, r1, #127 @ bit_length %= 128
sub r1, r1, #128 @ bit_length -= 128
@@ -2886,14 +3061,22 @@ aes_gcm_dec_192_kernel:
add r4, r0, r1, lsr #3 @ end_input_ptr
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
-
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
+ ldp r13, r14, [r8, #192] @ load rk12
+#ifdef __ARMEB__
+ ror r13, r13, #32
+ ror r14, r14, #32
+#endif
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v18.4s}, [r8], #16 @ load rk0
lsr r5, r1, #3 @ byte_len
mov r15, r5
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v19.4s}, [r8], #16 @ load rk1
lsr r12, r11, #32
orr r11, r11, r11
@@ -2903,7 +3086,7 @@ aes_gcm_dec_192_kernel:
fmov d1, r10 @ CTR block 1
add r12, r12, #1 @ increment rev_ctr32
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v20.4s}, [r8], #16 @ load rk2
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
@@ -2911,7 +3094,7 @@ aes_gcm_dec_192_kernel:
add r12, r12, #1 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v21.4s}, [r8], #16 @ load rk3
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
@@ -2929,54 +3112,57 @@ aes_gcm_dec_192_kernel:
fmov v3.d[1], r9 @ CTR block 3
- ldr q26, [r8, #128] @ load rk8
+ ld1 {v22.4s}, [r8], #16 @ load rk4
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
- ldr q29, [r8, #176] @ load rk11
+ ld1 {v23.4s}, [r8], #16 @ load rk5
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
- ldp r13, r14, [r8, #192] @ load rk12
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
- ldr q28, [r8, #160] @ load rk10
+ ld1 {v24.4s}, [r8], #16 @ load rk6
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v25.4s}, [r8], #16 @ load rk7
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
@@ -2994,7 +3180,7 @@ aes_gcm_dec_192_kernel:
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
- ldr q23, [r8, #80] @ load rk5
+ ld1 {v28.4s}, [r8], #16 @ load rk10
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
@@ -3009,7 +3195,7 @@ aes_gcm_dec_192_kernel:
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v29.4s}, [r8], #16 @ load rk11
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
@@ -3096,17 +3282,13 @@ aes_gcm_dec_192_kernel:
aese q0, v29.16b @ AES block 0 - round 11
bge .L192_dec_tail @ handle tail
- ldr q5, [r0, #16] @ AES block 1 - load ciphertext
-
- ldr q4, [r0, #0] @ AES block 0 - load ciphertext
+ ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext
eor q1, q5, q1 @ AES block 1 - result
eor q0, q4, q0 @ AES block 0 - result
rev r9, r12 @ CTR block 4
- ldr q7, [r0, #48] @ AES block 3 - load ciphertext
-
- ldr q6, [r0, #32] @ AES block 2 - load ciphertext
+ ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext
mov r19, v1.d[0] @ AES block 1 - mov low
@@ -3118,27 +3300,35 @@ aes_gcm_dec_192_kernel:
mov r7, v0.d[1] @ AES block 0 - mov high
rev64 q4, q4 @ GHASH block 0
- add r0, r0, #64 @ AES input_ptr update
fmov d0, r10 @ CTR block 4
rev64 q5, q5 @ GHASH block 1
cmp r0, r5 @ check if we have <= 8 blocks
eor r19, r19, r13 @ AES block 1 - round 12 low
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
fmov d1, r10 @ CTR block 5
eor r20, r20, r14 @ AES block 1 - round 12 high
-
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
add r12, r12, #1 @ CTR block 5
fmov v1.d[1], r9 @ CTR block 5
eor r6, r6, r13 @ AES block 0 - round 12 low
-
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
rev r9, r12 @ CTR block 6
eor r7, r7, r14 @ AES block 0 - round 12 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
orr r9, r11, r9, lsl #32 @ CTR block 6
@@ -3201,7 +3391,9 @@ aes_gcm_dec_192_kernel:
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor r22, r22, r14 @ AES block 4k+2 - round 12 high
-
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q4, q4, q5 @ GHASH block 4k+1 - mid
@@ -3218,7 +3410,9 @@ aes_gcm_dec_192_kernel:
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low
eor r21, r21, r13 @ AES block 4k+2 - round 12 low
-
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
@@ -3320,16 +3514,18 @@ aes_gcm_dec_192_kernel:
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
- ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
- ldr q7, [r0, #48] @ AES block 4k+7 - load ciphertext
+ ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext
eor r23, r23, r13 @ AES block 4k+3 - round 12 low
-
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
ext q9, q9, q9, #8 @ MODULO - other top alignment
@@ -3343,10 +3539,10 @@ aes_gcm_dec_192_kernel:
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
- ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q1, v29.16b @ AES block 4k+5 - round 11
- ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext
rev r9, r12 @ CTR block 4k+8
aese q3, v26.16b
@@ -3357,11 +3553,13 @@ aes_gcm_dec_192_kernel:
aesmc q2, q2 @ AES block 4k+6 - round 9
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
- add r0, r0, #64 @ AES input_ptr update
cmp r0, r5 @ .LOOP CONTROL
eor q0, q4, q0 @ AES block 4k+4 - result
eor r24, r24, r14 @ AES block 4k+3 - round 12 high
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
eor q1, q5, q1 @ AES block 4k+5 - result
aese q2, v28.16b
@@ -3394,18 +3592,28 @@ aes_gcm_dec_192_kernel:
rev r9, r12 @ CTR block 4k+9
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
fmov d1, r10 @ CTR block 4k+9
add r12, r12, #1 @ CTR block 4k+9
eor r19, r19, r13 @ AES block 4k+5 - round 12 low
-
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
eor r20, r20, r14 @ AES block 4k+5 - round 12 high
-
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
@@ -3459,18 +3667,29 @@ aes_gcm_dec_192_kernel:
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor r24, r24, r14 @ AES block 4k+3 - round 12 high
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
fmov v3.d[1], r9 @ CTR block 4k+7
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor r21, r21, r13 @ AES block 4k+2 - round 12 low
-
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
eor r22, r22, r14 @ AES block 4k+2 - round 12 high
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
eor q4, q4, q5 @ GHASH block 4k+1 - mid
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor r23, r23, r13 @ AES block 4k+3 - round 12 low
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
rev64 q7, q7 @ GHASH block 4k+3
@@ -3660,8 +3879,13 @@ aes_gcm_dec_192_kernel:
cmp r5, #48
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
bgt .L192_dec_blocks_more_than_3
movi v11.8b, #0
@@ -3705,10 +3929,16 @@ aes_gcm_dec_192_kernel:
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
eor r6, r6, r13 @ AES final-2 block - round 12 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
movi q8, #0 @ suppress further partial tag feed in
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r7, r7, r14 @ AES final-2 block - round 12 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
.L192_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
@@ -3738,8 +3968,13 @@ aes_gcm_dec_192_kernel:
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor r7, r7, r14 @ AES final-1 block - round 12 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES final-1 block - round 12 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1:@ blocks left > 1
@@ -3770,9 +4005,13 @@ aes_gcm_dec_192_kernel:
movi q8, #0 @ suppress further partial tag feed in
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor r7, r7, r14 @ AES final block - round 12 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES final block - round 12 low
-
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1:@ blocks left <= 1
@@ -3799,8 +4038,11 @@ aes_gcm_dec_192_kernel:
orr r6, r6, r4
mov v0.d[1], r10
-
+#ifndef __ARMEB__
rev r9, r12
+#else
+ mov r9, r12
+#endif
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
str r9, [r16, #12] @ store the updated counter
@@ -3888,14 +4130,22 @@ aes_gcm_enc_256_kernel:
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
-
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
+ ldp r13, r14, [r8, #224] @ load rk14
+#ifdef __ARMEB__
+ ror r13, r13, #32
+ ror r14, r14, #32
+#endif
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub r5, r5, #1 @ byte_len - 1
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v18.4s}, [r8], #16 @ load rk0
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v19.4s}, [r8], #16 @ load rk1
add r5, r5, r0
lsr r12, r11, #32
@@ -3915,14 +4165,14 @@ aes_gcm_enc_256_kernel:
orr r9, r11, r9, lsl #32 @ CTR block 1
add r12, r12, #1 @ CTR block 1
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v20.4s}, [r8], #16 @ load rk2
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v21.4s}, [r8], #16 @ load rk3
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
@@ -3935,50 +4185,53 @@ aes_gcm_enc_256_kernel:
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v22.4s}, [r8], #16 @ load rk4
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v23.4s}, [r8], #16 @ load rk5
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
- ldr q23, [r8, #80] @ load rk5
+ ld1 {v24.4s}, [r8], #16 @ load rk6
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
-
+#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
- ldr q31, [r8, #208] @ load rk13
+ ld1 {v25.4s}, [r8], #16 @ load rk7
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
-
+#endif
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
- ldr q30, [r8, #192] @ load rk12
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
-
+#endif
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
- ldr q29, [r8, #176] @ load rk11
+ ld1 {v28.4s}, [r8], #16 @ load rk10
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
- ldr q26, [r8, #128] @ load rk8
+ ld1 {v29.4s}, [r8], #16 @ load rk11
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
@@ -3986,7 +4239,6 @@ aes_gcm_enc_256_kernel:
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
- ldp r13, r14, [r8, #224] @ load rk14
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
@@ -4024,16 +4276,17 @@ aes_gcm_enc_256_kernel:
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v30.4s}, [r8], #16 @ load rk12
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
- ldr q28, [r8, #160] @ load rk10
+ ld1 {v31.4s}, [r8], #16 @ load rk13
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
@@ -4122,13 +4375,26 @@ aes_gcm_enc_256_kernel:
bge .L256_enc_tail @ handle tail
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
-
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
rev r9, r12 @ CTR block 4
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
-
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
add r0, r0, #64 @ AES input_ptr update
eor r19, r19, r13 @ AES block 1 - round 14 low
@@ -4211,11 +4477,17 @@ aes_gcm_enc_256_kernel:
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext
-
+#ifdef __ARMEB__
+ rev r23, r23
+ rev r24, r24
+#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __ARMEB__
+ rev r21, r21
+ rev r22, r22
+#endif
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor q4, q4, v11.16b @ PRE 1
@@ -4325,7 +4597,10 @@ aes_gcm_enc_256_kernel:
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __ARMEB__
+ rev r19, r19
+ rev r20, r20
+#endif
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
mov d4, v7.d[1] @ GHASH block 4k+3 - mid
@@ -4362,7 +4637,10 @@ aes_gcm_enc_256_kernel:
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
shl d8, d8, #56 @ mod_constant
@@ -4724,7 +5002,10 @@ aes_gcm_enc_256_kernel:
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
@@ -4759,7 +5040,10 @@ aes_gcm_enc_256_kernel:
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
rev64 q4, q5 @ GHASH final-3 block
eor r6, r6, r13 @ AES final-2 block - round 14 low
@@ -4788,7 +5072,10 @@ aes_gcm_enc_256_kernel:
st1 { q5}, [r2], #16 @ AES final-2 block - store result
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
rev64 q4, q5 @ GHASH final-2 block
eor r6, r6, r13 @ AES final-1 block - round 14 low
@@ -4824,7 +5111,10 @@ aes_gcm_enc_256_kernel:
rev64 q4, q5 @ GHASH final-1 block
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
-
+#ifdef __ARMEB__
+ rev r6, r6
+ rev r7, r7
+#endif
eor q4, q4, q8 @ feed in partial tag
movi q8, #0 @ suppress further partial tag feed in
@@ -4885,7 +5175,11 @@ aes_gcm_enc_256_kernel:
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
mov d8, v4.d[1] @ GHASH final block - mid
+#ifndef __ARMEB__
rev r9, r12
+#else
+ mov r9, r12
+#endif
pmull v21.1q, q4, v12.1d @ GHASH final block - low
@@ -4959,21 +5253,29 @@ aes_gcm_dec_256_kernel:
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
-
- ldr q26, [r8, #128] @ load rk8
+#ifdef __ARMEB__
+ rev r10, r10
+ rev r11, r11
+#endif
+ ldp r13, r14, [r8, #224] @ load rk14
+#ifdef __ARMEB__
+ ror r14, r14, #32
+ ror r13, r13, #32
+#endif
+ ld1 {v18.4s}, [r8], #16 @ load rk0
sub r5, r5, #1 @ byte_len - 1
- ldr q25, [r8, #112] @ load rk7
+ ld1 {v19.4s}, [r8], #16 @ load rk1
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add r4, r0, r1, lsr #3 @ end_input_ptr
- ldr q24, [r8, #96] @ load rk6
+ ld1 {v20.4s}, [r8], #16 @ load rk2
lsr r12, r11, #32
- ldr q23, [r8, #80] @ load rk5
+ ld1 {v21.4s}, [r8], #16 @ load rk3
orr r11, r11, r11
- ldr q21, [r8, #48] @ load rk3
+ ld1 {v22.4s}, [r8], #16 @ load rk4
add r5, r5, r0
rev r12, r12 @ rev_ctr32
@@ -4998,39 +5300,44 @@ aes_gcm_dec_256_kernel:
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
- ldr q18, [r8, #0] @ load rk0
+ ld1 {v23.4s}, [r8], #16 @ load rk5
fmov v3.d[1], r9 @ CTR block 3
add r12, r12, #1 @ CTR block 3
- ldr q22, [r8, #64] @ load rk4
+ ld1 {v24.4s}, [r8], #16 @ load rk6
- ldr q31, [r8, #208] @ load rk13
+ ld1 {v25.4s}, [r8], #16 @ load rk7
- ldr q19, [r8, #16] @ load rk1
+ ld1 {v26.4s}, [r8], #16 @ load rk8
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
ldr q14, [r3, #80] @ load h3l | h3h
+#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
+#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
+#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
+#endif
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q13, [r3, #64] @ load h2l | h2h
+#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
+#endif
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
- ldr q20, [r8, #32] @ load rk2
+ ld1 {v27.4s}, [r8], #16 @ load rk9
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
- ldp r13, r14, [r8, #224] @ load rk14
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
@@ -5040,20 +5347,21 @@ aes_gcm_dec_256_kernel:
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
- ldr q27, [r8, #144] @ load rk9
+ ld1 {v28.4s}, [r8], #16 @ load rk10
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
- ldr q30, [r8, #192] @ load rk12
+ ld1 {v29.4s}, [r8], #16 @ load rk11
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
ldr q12, [r3, #32] @ load h1l | h1h
+#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
-
+#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
- ldr q28, [r8, #160] @ load rk10
+ ld1 {v30.4s}, [r8], #16 @ load rk12
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
@@ -5136,7 +5444,7 @@ aes_gcm_dec_256_kernel:
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
- ldr q29, [r8, #176] @ load rk11
+ ld1 {v31.4s}, [r8], #16 @ load rk13
aese q1, v27.16b
aesmc q1, q1 @ AES block 1 - round 9
@@ -5201,9 +5509,7 @@ aes_gcm_dec_256_kernel:
aese q0, v31.16b @ AES block 0 - round 13
bge .L256_dec_tail @ handle tail
- ldr q4, [r0, #0] @ AES block 0 - load ciphertext
-
- ldr q5, [r0, #16] @ AES block 1 - load ciphertext
+ ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext
rev r9, r12 @ CTR block 4
@@ -5211,7 +5517,7 @@ aes_gcm_dec_256_kernel:
eor q1, q5, q1 @ AES block 1 - result
rev64 q5, q5 @ GHASH block 1
- ldr q7, [r0, #48] @ AES block 3 - load ciphertext
+ ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext
mov r7, v0.d[1] @ AES block 0 - mov high
@@ -5231,22 +5537,32 @@ aes_gcm_dec_256_kernel:
orr r9, r11, r9, lsl #32 @ CTR block 5
mov r20, v1.d[1] @ AES block 1 - mov high
eor r7, r7, r14 @ AES block 0 - round 14 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
eor r6, r6, r13 @ AES block 0 - round 14 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
fmov d1, r10 @ CTR block 5
- ldr q6, [r0, #32] @ AES block 2 - load ciphertext
- add r0, r0, #64 @ AES input_ptr update
+ ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext
fmov v1.d[1], r9 @ CTR block 5
rev r9, r12 @ CTR block 6
add r12, r12, #1 @ CTR block 6
eor r19, r19, r13 @ AES block 1 - round 14 low
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
orr r9, r11, r9, lsl #32 @ CTR block 6
eor r20, r20, r14 @ AES block 1 - round 14 high
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
stp r19, r20, [r2], #16 @ AES block 1 - store result
eor q2, q6, q2 @ AES block 2 - result
@@ -5297,7 +5613,9 @@ aes_gcm_dec_256_kernel:
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor r22, r22, r14 @ AES block 4k+2 - round 14 high
-
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
mov d10, v17.d[1] @ GHASH block 4k - mid
@@ -5309,7 +5627,9 @@ aes_gcm_dec_256_kernel:
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor r21, r21, r13 @ AES block 4k+2 - round 14 low
-
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
@@ -5324,9 +5644,14 @@ aes_gcm_dec_256_kernel:
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor r23, r23, r13 @ AES block 4k+3 - round 14 low
-
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor r24, r24, r14 @ AES block 4k+3 - round 14 high
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
eor q9, q9, q4 @ GHASH block 4k+1 - high
aese q2, v22.16b
@@ -5447,7 +5772,7 @@ aes_gcm_dec_256_kernel:
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
- ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext
aese q0, v31.16b @ AES block 4k+4 - round 13
ext q9, q9, q9, #8 @ MODULO - other top alignment
@@ -5458,7 +5783,7 @@ aes_gcm_dec_256_kernel:
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
- ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
@@ -5474,11 +5799,11 @@ aes_gcm_dec_256_kernel:
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
- ldr q7, [r0, #48] @ AES block 4k+7 - load ciphertext
+ ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q1, v30.16b
aesmc q1, q1 @ AES block 4k+5 - round 12
- ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
@@ -5489,7 +5814,6 @@ aes_gcm_dec_256_kernel:
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q1, v31.16b @ AES block 4k+5 - round 13
- add r0, r0, #64 @ AES input_ptr update
mov r6, v0.d[0] @ AES block 4k+4 - mov low
aese q2, v30.16b
@@ -5511,8 +5835,13 @@ aes_gcm_dec_256_kernel:
add r12, r12, #1 @ CTR block 4k+9
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
-
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
mov r20, v1.d[1] @ AES block 4k+5 - mov high
eor q2, q6, q2 @ AES block 4k+6 - result
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
@@ -5533,9 +5862,15 @@ aes_gcm_dec_256_kernel:
rev64 q5, q5 @ GHASH block 4k+5
eor r20, r20, r14 @ AES block 4k+5 - round 14 high
+#ifdef __ARMEB__
+ rev r20, r20
+#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor r19, r19, r13 @ AES block 4k+5 - round 14 low
+#ifdef __ARMEB__
+ rev r19, r19
+#endif
stp r19, r20, [r2], #16 @ AES block 4k+5 - store result
rev64 q4, q4 @ GHASH block 4k+4
@@ -5742,11 +6077,15 @@ aes_gcm_dec_256_kernel:
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
eor r22, r22, r14 @ AES block 4k+2 - round 14 high
-
+#ifdef __ARMEB__
+ rev r22, r22
+#endif
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor r23, r23, r13 @ AES block 4k+3 - round 14 low
-
+#ifdef __ARMEB__
+ rev r23, r23
+#endif
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
@@ -5758,12 +6097,18 @@ aes_gcm_dec_256_kernel:
aese q1, v29.16b
aesmc q1, q1 @ AES block 4k+5 - round 11
eor r21, r21, r13 @ AES block 4k+2 - round 14 low
+#ifdef __ARMEB__
+ rev r21, r21
+#endif
aese q2, v30.16b
aesmc q2, q2 @ AES block 4k+6 - round 12
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor r24, r24, r14 @ AES block 4k+3 - round 14 high
+#ifdef __ARMEB__
+ rev r24, r24
+#endif
aese q3, v29.16b
aesmc q3, q3 @ AES block 4k+7 - round 11
@@ -5804,8 +6149,14 @@ aes_gcm_dec_256_kernel:
cmp r5, #48
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
bgt .L256_dec_blocks_more_than_3
sub r12, r12, #1
@@ -5853,9 +6204,15 @@ aes_gcm_dec_256_kernel:
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r6, r6, r13 @ AES final-2 block - round 14 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
eor r7, r7, r14 @ AES final-2 block - round 14 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
.L256_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
@@ -5883,9 +6240,15 @@ aes_gcm_dec_256_kernel:
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor r6, r6, r13 @ AES final-1 block - round 14 low
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
eor r7, r7, r14 @ AES final-1 block - round 14 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
.L256_dec_blocks_more_than_1:@ blocks left > 1
stp r6, r7, [r2], #16 @ AES final-1 block - store result
@@ -5913,13 +6276,18 @@ aes_gcm_dec_256_kernel:
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
eor r6, r6, r13 @ AES final block - round 14 low
-
+#ifdef __ARMEB__
+ rev r6, r6
+#endif
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor q9, q9, v20.16b @ GHASH final-1 block - high
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
eor r7, r7, r14 @ AES final block - round 14 high
+#ifdef __ARMEB__
+ rev r7, r7
+#endif
.L256_dec_blocks_less_than_1:@ blocks left <= 1
and r1, r1, #127 @ bit_length %= 128
@@ -5945,7 +6313,11 @@ aes_gcm_dec_256_kernel:
mov v0.d[1], r10
bic r4, r4, r9 @ mask out low existing bytes
+#ifndef __ARMEB__
rev r9, r12
+#else
+ mov r9, r12
+#endif
bic r5, r5, r10 @ mask out high existing bytes
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.7
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.6 Tue May 9 13:21:16 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S Wed May 31 15:35:31 2023
@@ -1,5 +1,5 @@
#include "arm_asm.h"
-@ Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
+@ Copyright 2012-2023 The OpenSSL Project Authors. All Rights Reserved.
@
@ Licensed under the Apache License 2.0 (the "License"). You may not use
@ this file except in compliance with the License. You can obtain a copy
@@ -14,7 +14,7 @@
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-@ of Linaro. Permission to use under GPL terms is granted.
+@ of Linaro.
@ ====================================================================
@ Bit-sliced AES for ARM NEON
@@ -1394,7 +1394,7 @@ ossl_bsaes_ctr32_encrypt_blocks:
.align 2
add r12, r3, #248
vld1.8 {q0}, [r8] @ load counter
- adrl r8, .LREVM0SR @ borrow r8
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
vldmia r12, {q4} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
#endif
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.2
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.1 Tue May 9 13:22:44 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S Wed May 31 15:35:31 2023
@@ -1,3 +1,4 @@
+.machine "any"
.text
.globl p521_felem_mul
Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S
diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.2
--- src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.1 Tue May 9 13:22:44 2023
+++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S Wed May 31 15:35:31 2023
@@ -1,3 +1,4 @@
+.machine "any"
.text
.globl p521_felem_mul