Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM
On 28 November 2016 at 14:17, Herbert Xu wrote: > On Thu, Nov 24, 2016 at 05:32:42PM +0000, Ard Biesheuvel wrote: >> On 24 November 2016 at 15:43, Ard Biesheuvel >> wrote: >> > This is a straight transliteration of the Intel algorithm implemented >> > using SSE and PCLMULQDQ instructions that resides under in the file >> > arch/x86/crypto/crct10dif-pcl-asm_64.S. >> > >> > Signed-off-by: Ard Biesheuvel >> > --- >> > arch/arm/crypto/Kconfig | 5 + >> > arch/arm/crypto/Makefile | 2 + >> > arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++- >> > arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c | 23 +- >> > 4 files changed, 277 insertions(+), 210 deletions(-) >> > >> >> This patch needs the following hunk folded in to avoid breaking the >> Thumb2 build: >> >> """ >> diff --git a/arch/arm/crypto/crct10dif-ce-core.S >> b/arch/arm/crypto/crct10dif-ce-core.S >> index 30168b0f8581..4fdbca94dd0c 100644 >> --- a/arch/arm/crypto/crct10dif-ce-core.S >> +++ b/arch/arm/crypto/crct10dif-ce-core.S >> @@ -152,7 +152,8 @@ CPU_LE( vrev64.8 q7, q7 ) >> // XOR the initial_crc value >> veor.8 q0, q0, q10 >> >> - adrl ip, rk3 >> +ARM( adrl ip, rk3 ) >> +THUMB( adr ip, rk3 ) >> vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4 >> // type of pmull instruction >> // will determine which constant to >> use >> """ > > I'm sorry but this patch doesn't apply on top of the other four. > So please resend the whole series. > Yes, please disregard all CRC ARM/arm64 patches for now, I will consolidate them into a single v2 and send it out after the merge window. -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM
On Thu, Nov 24, 2016 at 05:32:42PM +0000, Ard Biesheuvel wrote: > On 24 November 2016 at 15:43, Ard Biesheuvel > wrote: > > This is a straight transliteration of the Intel algorithm implemented > > using SSE and PCLMULQDQ instructions that resides under in the file > > arch/x86/crypto/crct10dif-pcl-asm_64.S. > > > > Signed-off-by: Ard Biesheuvel > > --- > > arch/arm/crypto/Kconfig | 5 + > > arch/arm/crypto/Makefile | 2 + > > arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++- > > arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c | 23 +- > > 4 files changed, 277 insertions(+), 210 deletions(-) > > > > This patch needs the following hunk folded in to avoid breaking the > Thumb2 build: > > """ > diff --git a/arch/arm/crypto/crct10dif-ce-core.S > b/arch/arm/crypto/crct10dif-ce-core.S > index 30168b0f8581..4fdbca94dd0c 100644 > --- a/arch/arm/crypto/crct10dif-ce-core.S > +++ b/arch/arm/crypto/crct10dif-ce-core.S > @@ -152,7 +152,8 @@ CPU_LE( vrev64.8 q7, q7 ) > // XOR the initial_crc value > veor.8 q0, q0, q10 > > - adrl ip, rk3 > +ARM( adrl ip, rk3 ) > +THUMB( adr ip, rk3 ) > vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4 > // type of pmull instruction > // will determine which constant to > use > """ I'm sorry but this patch doesn't apply on top of the other four. So please resend the whole series. Thanks, -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM
On 24 November 2016 at 15:43, Ard Biesheuvel wrote: > This is a straight transliteration of the Intel algorithm implemented > using SSE and PCLMULQDQ instructions that resides under in the file > arch/x86/crypto/crct10dif-pcl-asm_64.S. > > Signed-off-by: Ard Biesheuvel > --- > arch/arm/crypto/Kconfig | 5 + > arch/arm/crypto/Makefile | 2 + > arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++- > arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c | 23 +- > 4 files changed, 277 insertions(+), 210 deletions(-) > This patch needs the following hunk folded in to avoid breaking the Thumb2 build: """ diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 30168b0f8581..4fdbca94dd0c 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -152,7 +152,8 @@ CPU_LE( vrev64.8 q7, q7 ) // XOR the initial_crc value veor.8 q0, q0, q10 - adrl ip, rk3 +ARM( adrl ip, rk3 ) +THUMB( adr ip, rk3 ) vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4 // type of pmull instruction // will determine which constant to use """ Updated patch(es) can be found here https://git.kernel.org/cgit/linux/kernel/git/ardb/linux.git/log/?h=arm-crct10dif -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM
This is a straight transliteration of the Intel algorithm implemented using SSE and PCLMULQDQ instructions that resides in the file arch/x86/crypto/crct10dif-pcl-asm_64.S. Signed-off-by: Ard Biesheuvel --- arch/arm/crypto/Kconfig | 5 + arch/arm/crypto/Makefile | 2 + arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++- arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c | 23 +- 4 files changed, 277 insertions(+), 210 deletions(-) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 27ed1b1cd1d7..fce801fa52a1 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) that is part of the ARMv8 Crypto Extensions +config CRYPTO_CRCT10DIF_ARM_CE + tristate "CRCT10DIF digest algorithm using PMULL instructions" + depends on KERNEL_MODE_NEON && CRC_T10DIF + select CRYPTO_HASH + endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index fc5150702b64..fc77265014b7 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o +ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o ifneq ($(ce-obj-y)$(ce-obj-m),) ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y) @@ -36,6 +37,7 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o +crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S similarity index 60% copy from arch/arm64/crypto/crct10dif-ce-core.S copy to 
arch/arm/crypto/crct10dif-ce-core.S index 9148ebd3470a..30168b0f8581 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -1,5 +1,5 @@ // -// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions // // Copyright (C) 2016 Linaro Ltd // @@ -71,20 +71,43 @@ #include #include - .text - .cpugeneric+crypto - - arg1_low32 .reqw0 - arg2.reqx1 - arg3.reqx2 +#ifdef CONFIG_CPU_ENDIAN_BE8 +#define CPU_LE(code...) +#else +#define CPU_LE(code...)code +#endif - vzr .reqv13 + .text + .fpucrypto-neon-fp-armv8 + + arg1_low32 .reqr0 + arg2.reqr1 + arg3.reqr2 + + qzr .reqq13 + + q0l .reqd0 + q0h .reqd1 + q1l .reqd2 + q1h .reqd3 + q2l .reqd4 + q2h .reqd5 + q3l .reqd6 + q3h .reqd7 + q4l .reqd8 + q4h .reqd9 + q5l .reqd10 + q5h .reqd11 + q6l .reqd12 + q6h .reqd13 + q7l .reqd14 + q7h .reqd15 ENTRY(crc_t10dif_pmull) - stp x29, x30, [sp, #-32]! - mov x29, sp + push{r4, lr} + sub sp, sp, #0x10 - movivzr.16b, #0 // init zero register + vmov.i8 qzr, #0 // init zero register // adjust the 16-bit initial_crc value, scale it to 32 bits lsl arg1_low32, arg1_low32, #16 @@ -93,41 +116,44 @@ ENTRY(crc_t10dif_pmull) cmp arg3, #256 // for sizes less than 128, we can't fold 64B at a time... - b.lt_less_than_128 + blt _less_than_128 // load the initial crc value // crc value does not need to be byte-reflected, but it needs // to be moved to the high part of the register. // because data will be byte-reflected and will align with // initial crc at correct place. - moviv10.16b, #0 - mov v10.s[3], arg1_low32// initial crc + vmovs0, arg1_low32 // initial crc + vext.8 q10, qzr, q0, #4 // receive the initial 64B data, xor the initial crc value - ld1 {v0.2d-v3.2d}, [arg2], #0x40 - ld1 {v4.2d-v7.2d}, [arg2], #0x40 -CPU_LE(rev64 v0.16b, v0.