This updates both the core GHASH transform and the AES-GCM driver to
yield the NEON unit each time a fixed chunk of input has been processed.
For the GCM driver, we align with the other AES/CE block mode drivers
and use a chunk size of 64 bytes. The core GHASH routine is much
shorter, so let's use a chunk size of 128 bytes for that one.

Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
 arch/arm64/crypto/ghash-ce-core.S | 128 ++++++++++++++------
 1 file changed, 92 insertions(+), 36 deletions(-)
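
Note (after the '---', not part of the commit message): the yield_neon_pre /
yield_neon_post macros used below are introduced earlier in this series and
are not shown in this hunk. For readers who prefer C, the sketch below is only
an illustration of the same idea expressed at the glue-code level: process a
fixed chunk of input per NEON section so that kernel_neon_end() provides a
natural preemption point. The wrapper name ghash_do_update_chunked(), the
YIELD_CHUNK value and the exact prototype of the asm helper are placeholders;
the patch itself performs the yield inside the assembly routines instead.

	/* Illustrative sketch only -- not the driver's actual glue code. */
	#include <linux/kernel.h>
	#include <linux/types.h>
	#include <asm/neon.h>

	#define GHASH_BLOCK_SIZE	16
	#define YIELD_CHUNK		64	/* bytes per NEON section, hypothetical */

	struct ghash_key;			/* opaque here; defined in the glue code */

	/* stands in for pmull_ghash_update_p64/_p8; prototype assumed */
	void pmull_ghash_update(int blocks, u64 dg[], const char *src,
				struct ghash_key const *k, const char *head);

	static void ghash_do_update_chunked(u64 dg[], const char *src,
					    unsigned int len,
					    struct ghash_key const *k)
	{
		while (len >= GHASH_BLOCK_SIZE) {
			unsigned int n = min_t(unsigned int, len, YIELD_CHUNK);

			n = round_down(n, GHASH_BLOCK_SIZE);

			kernel_neon_begin();
			pmull_ghash_update(n / GHASH_BLOCK_SIZE, dg, src, k, NULL);
			kernel_neon_end();	/* re-enables preemption */

			src += n;
			len -= n;
		}
	}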

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 11ebf1ae248a..fbfd4681675d 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -212,23 +212,36 @@
        ushr            XL.2d, XL.2d, #1
        .endm
 
-       .macro          __pmull_ghash, pn
-       ld1             {SHASH.2d}, [x3]
-       ld1             {XL.2d}, [x1]
+       .macro          __pmull_ghash, pn, yield
+       stp             x29, x30, [sp, #-64]!
+       mov             x29, sp
+       stp             x19, x20, [sp, #16]
+       stp             x21, x22, [sp, #32]
+       str             x23, [sp, #48]
+
+       mov             x19, x0
+       mov             x20, x1
+       mov             x21, x2
+       mov             x22, x3
+       mov             x23, x4
+
+0:     ld1             {SHASH.2d}, [x22]
+       ld1             {XL.2d}, [x20]
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b
 
        __pmull_pre_\pn
 
        /* do the head block first, if supplied */
-       cbz             x4, 0f
-       ld1             {T1.2d}, [x4]
-       b               1f
+       cbz             x23, 1f
+       ld1             {T1.2d}, [x23]
+       mov             x23, xzr
+       b               2f
 
-0:     ld1             {T1.2d}, [x2], #16
-       sub             w0, w0, #1
+1:     ld1             {T1.2d}, [x21], #16
+       sub             w19, w19, #1
 
-1:     /* multiply XL by SHASH in GF(2^128) */
+2:     /* multiply XL by SHASH in GF(2^128) */
 CPU_LE(        rev64           T1.16b, T1.16b  )
 
        ext             T2.16b, XL.16b, XL.16b, #8
@@ -250,9 +263,19 @@ CPU_LE(    rev64           T1.16b, T1.16b  )
        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b
 
-       cbnz            w0, 0b
+       cbz             w19, 3f
 
-       st1             {XL.2d}, [x1]
+       yield_neon_pre  w19, \yield, 1, 1b
+       st1             {XL.2d}, [x20]
+       yield_neon_post 0b
+
+       b               1b
+
+3:     st1             {XL.2d}, [x20]
+       ldp             x19, x20, [sp, #16]
+       ldp             x21, x22, [sp, #32]
+       ldr             x23, [sp, #48]
+       ldp             x29, x30, [sp], #64
        ret
        .endm
 
@@ -261,11 +284,11 @@ CPU_LE(   rev64           T1.16b, T1.16b  )
         *                         struct ghash_key const *k, const char *head)
         */
 ENTRY(pmull_ghash_update_p64)
-       __pmull_ghash   p64
+       __pmull_ghash   p64, 5
 ENDPROC(pmull_ghash_update_p64)
 
 ENTRY(pmull_ghash_update_p8)
-       __pmull_ghash   p8
+       __pmull_ghash   p8, 2
 ENDPROC(pmull_ghash_update_p8)
 
        KS              .req    v8
@@ -304,38 +327,56 @@ ENDPROC(pmull_ghash_update_p8)
        .endm
 
        .macro          pmull_gcm_do_crypt, enc
-       ld1             {SHASH.2d}, [x4]
-       ld1             {XL.2d}, [x1]
-       ldr             x8, [x5, #8]                    // load lower counter
+       stp             x29, x30, [sp, #-96]!
+       mov             x29, sp
+       stp             x19, x20, [sp, #16]
+       stp             x21, x22, [sp, #32]
+       stp             x23, x24, [sp, #48]
+       stp             x25, x26, [sp, #64]
+       str             x27, [sp, #80]
+
+       mov             x19, x0
+       mov             x20, x1
+       mov             x21, x2
+       mov             x22, x3
+       mov             x23, x4
+       mov             x24, x5
+       mov             x25, x6
+       mov             x26, x7
+
+       ldr             x27, [x24, #8]                  // load lower counter
+CPU_LE(        rev             x27, x27        )
+
+0:     ld1             {SHASH.2d}, [x23]
+       ld1             {XL.2d}, [x20]
 
        movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
-CPU_LE(        rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b
 
        .if             \enc == 1
-       ld1             {KS.16b}, [x7]
+       ld1             {KS.16b}, [x26]
        .endif
 
-0:     ld1             {CTR.8b}, [x5]                  // load upper counter
-       ld1             {INP.16b}, [x3], #16
-       rev             x9, x8
-       add             x8, x8, #1
-       sub             w0, w0, #1
+1:     ld1             {CTR.8b}, [x24]                 // load upper counter
+       ld1             {INP.16b}, [x22], #16
+       rev             x9, x27
+       add             x27, x27, #1
+       sub             w19, w19, #1
        ins             CTR.d[1], x9                    // set lower counter
 
        .if             \enc == 1
        eor             INP.16b, INP.16b, KS.16b        // encrypt input
-       st1             {INP.16b}, [x2], #16
+       st1             {INP.16b}, [x21], #16
        .endif
 
        rev64           T1.16b, INP.16b
 
-       cmp             w6, #12
-       b.ge            2f                              // AES-192/256?
+       cmp             w25, #12
+       b.ge            4f                              // AES-192/256?
 
-1:     enc_round       CTR, v21
+2:     enc_round       CTR, v21
 
        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
@@ -390,27 +431,42 @@ CPU_LE(   rev             x8, x8          )
 
        .if             \enc == 0
        eor             INP.16b, INP.16b, KS.16b
-       st1             {INP.16b}, [x2], #16
+       st1             {INP.16b}, [x21], #16
        .endif
 
-       cbnz            w0, 0b
+       cbz             w19, 3f
 
-CPU_LE(        rev             x8, x8          )
-       st1             {XL.2d}, [x1]
-       str             x8, [x5, #8]                    // store lower counter
+       yield_neon_pre  w19, 8, 1, 1b                   // yield every 8 blocks
+       st1             {XL.2d}, [x20]
+       .if             \enc == 1
+       st1             {KS.16b}, [x26]
+       .endif
+       yield_neon_post 0b
 
+       b               1b
+
+3:     st1             {XL.2d}, [x20]
        .if             \enc == 1
-       st1             {KS.16b}, [x7]
+       st1             {KS.16b}, [x26]
        .endif
 
+CPU_LE(        rev             x27, x27        )
+       str             x27, [x24, #8]                  // store lower counter
+
+       ldp             x19, x20, [sp, #16]
+       ldp             x21, x22, [sp, #32]
+       ldp             x23, x24, [sp, #48]
+       ldp             x25, x26, [sp, #64]
+       ldr             x27, [sp, #80]
+       ldp             x29, x30, [sp], #96
        ret
 
-2:     b.eq            3f                              // AES-192?
+4:     b.eq            5f                              // AES-192?
        enc_round       CTR, v17
        enc_round       CTR, v18
-3:     enc_round       CTR, v19
+5:     enc_round       CTR, v19
        enc_round       CTR, v20
-       b               1b
+       b               2b
        .endm
 
        /*
-- 
2.11.0
