Replace the literal load of the addend vector with a sequence that
composes it using immediates. While at it, tweak the code that refers
to it so that it no longer clobbers the register, which allows the
setup of the addend vector to be hoisted out of the loop as well.

This results in generally better code, but also works around an issue
with Clang, whose integrated assembler does not implement the GNU ARM
asm syntax completely, and does not support the =literal notation for
FP registers.
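
For reference, a sketch of how the new sequence builds the addend
vector, with the resulting register contents noted after each step
(annotations only, not part of the generated code; 16-bit lanes are
listed lowest lane first):

	movi	v7.4h, #1		/* v7.8h = { 1, 1, 1, 1, 0, 0, 0, 0 } */
	movi	v8.4h, #2		/* v8.8h = { 2, 2, 2, 2, 0, 0, 0, 0 } */
	uaddl	v6.4s, v7.4h, v8.4h	/* v6.4s = { 3, 3, 3, 3 } */
	zip1	v8.8h, v7.8h, v8.8h	/* v8.8h = { 1, 2, 1, 2, 1, 2, 1, 2 } */
	zip1	v8.4s, v8.4s, v6.4s	/* v8.8h = { 1, 2, 3, 0, 1, 2, 3, 0 } */
	zip2	v8.8h, v8.8h, v7.8h	/* v8.4s = { 1, 2, 3, 0 } */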

Cc: Nick Desaulniers <ndesaulni...@google.com>
Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
 arch/arm64/crypto/aes-modes.S | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 483a7130cf0e..e966620ee230 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -225,6 +225,14 @@ AES_ENTRY(aes_ctr_encrypt)
        enc_prepare     w22, x21, x6
        ld1             {v4.16b}, [x24]
 
+       /* compose addend vector { 1, 2, 3, 0 } in v8.4s */
+       movi            v7.4h, #1
+       movi            v8.4h, #2
+       uaddl           v6.4s, v7.4h, v8.4h
+       zip1            v8.8h, v7.8h, v8.8h
+       zip1            v8.4s, v8.4s, v6.4s
+       zip2            v8.8h, v8.8h, v7.8h
+
        umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
        rev             x6, x6
 .LctrloopNx:
@@ -232,17 +240,16 @@ AES_ENTRY(aes_ctr_encrypt)
        bmi             .Lctr1x
        cmn             w6, #4                  /* 32 bit overflow? */
        bcs             .Lctr1x
-       ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
        dup             v7.4s, w6
        mov             v0.16b, v4.16b
        add             v7.4s, v7.4s, v8.4s
        mov             v1.16b, v4.16b
-       rev32           v8.16b, v7.16b
+       rev32           v7.16b, v7.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
-       mov             v1.s[3], v8.s[0]
-       mov             v2.s[3], v8.s[1]
-       mov             v3.s[3], v8.s[2]
+       mov             v1.s[3], v7.s[0]
+       mov             v2.s[3], v7.s[1]
+       mov             v3.s[3], v7.s[2]
        ld1             {v5.16b-v7.16b}, [x20], #48     /* get 3 input blocks */
        bl              aes_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b
@@ -296,7 +303,6 @@ AES_ENTRY(aes_ctr_encrypt)
        ins             v4.d[0], x7
        b               .Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
-       .ltorg
 
 
        /*
-- 
2.17.1
