Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM

2016-11-28 Thread Ard Biesheuvel
On 28 November 2016 at 14:17, Herbert Xu  wrote:
> On Thu, Nov 24, 2016 at 05:32:42PM +0000, Ard Biesheuvel wrote:
>> On 24 November 2016 at 15:43, Ard Biesheuvel  
>> wrote:
>> > This is a straight transliteration of the Intel algorithm implemented
>> > using SSE and PCLMULQDQ instructions that resides under in the file
>> > arch/x86/crypto/crct10dif-pcl-asm_64.S.
>> >
>> > Signed-off-by: Ard Biesheuvel 
>> > ---
>> >  arch/arm/crypto/Kconfig|   5 +
>> >  arch/arm/crypto/Makefile   |   2 +
>> >  arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++-
>> >  arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c |  23 +-
>> >  4 files changed, 277 insertions(+), 210 deletions(-)
>> >
>>
>> This patch needs the following hunk folded in to avoid breaking the
>> Thumb2 build:
>>
>> """
>> diff --git a/arch/arm/crypto/crct10dif-ce-core.S
>> b/arch/arm/crypto/crct10dif-ce-core.S
>> index 30168b0f8581..4fdbca94dd0c 100644
>> --- a/arch/arm/crypto/crct10dif-ce-core.S
>> +++ b/arch/arm/crypto/crct10dif-ce-core.S
>> @@ -152,7 +152,8 @@ CPU_LE( vrev64.8q7, q7  )
>> // XOR the initial_crc value
>> veor.8  q0, q0, q10
>>
>> -   adrlip, rk3
>> +ARM(   adrlip, rk3 )
>> +THUMB( adr ip, rk3 )
>> vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4
>> // type of pmull instruction
>> // will determine which constant to 
>> use
>> """
>
> I'm sorry but this patch doesn't apply on top of the other four.
> So please resend the whole series.
>

Yes, please disregard all CRC ARM/arm64 patches for now, I will
consolidate them into a single v2 and send it out after the merge
window.
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM

2016-11-28 Thread Herbert Xu
On Thu, Nov 24, 2016 at 05:32:42PM +0000, Ard Biesheuvel wrote:
> On 24 November 2016 at 15:43, Ard Biesheuvel  
> wrote:
> > This is a straight transliteration of the Intel algorithm implemented
> > using SSE and PCLMULQDQ instructions that resides under in the file
> > arch/x86/crypto/crct10dif-pcl-asm_64.S.
> >
> > Signed-off-by: Ard Biesheuvel 
> > ---
> >  arch/arm/crypto/Kconfig|   5 +
> >  arch/arm/crypto/Makefile   |   2 +
> >  arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++-
> >  arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c |  23 +-
> >  4 files changed, 277 insertions(+), 210 deletions(-)
> >
> 
> This patch needs the following hunk folded in to avoid breaking the
> Thumb2 build:
> 
> """
> diff --git a/arch/arm/crypto/crct10dif-ce-core.S
> b/arch/arm/crypto/crct10dif-ce-core.S
> index 30168b0f8581..4fdbca94dd0c 100644
> --- a/arch/arm/crypto/crct10dif-ce-core.S
> +++ b/arch/arm/crypto/crct10dif-ce-core.S
> @@ -152,7 +152,8 @@ CPU_LE( vrev64.8q7, q7  )
> // XOR the initial_crc value
> veor.8  q0, q0, q10
> 
> -   adrlip, rk3
> +ARM(   adrlip, rk3 )
> +THUMB( adr ip, rk3 )
> vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4
> // type of pmull instruction
> // will determine which constant to 
> use
> """

I'm sorry but this patch doesn't apply on top of the other four.
So please resend the whole series.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM

2016-11-24 Thread Ard Biesheuvel
On 24 November 2016 at 15:43, Ard Biesheuvel  wrote:
> This is a straight transliteration of the Intel algorithm implemented
> using SSE and PCLMULQDQ instructions that resides under in the file
> arch/x86/crypto/crct10dif-pcl-asm_64.S.
>
> Signed-off-by: Ard Biesheuvel 
> ---
>  arch/arm/crypto/Kconfig|   5 +
>  arch/arm/crypto/Makefile   |   2 +
>  arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++-
>  arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c |  23 +-
>  4 files changed, 277 insertions(+), 210 deletions(-)
>

This patch needs the following hunk folded in to avoid breaking the
Thumb2 build:

"""
diff --git a/arch/arm/crypto/crct10dif-ce-core.S
b/arch/arm/crypto/crct10dif-ce-core.S
index 30168b0f8581..4fdbca94dd0c 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -152,7 +152,8 @@ CPU_LE( vrev64.8        q7, q7  )
        // XOR the initial_crc value
        veor.8          q0, q0, q10
 
-       adrl            ip, rk3
+ARM(   adrl            ip, rk3         )
+THUMB( adr             ip, rk3         )
        vld1.64         {q10}, [ip]     // xmm10 has rk3 and rk4
                                        // type of pmull instruction
                                        // will determine which constant to use
"""

Updated patch(es) can be found here
https://git.kernel.org/cgit/linux/kernel/git/ardb/linux.git/log/?h=arm-crct10dif
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM

2016-11-24 Thread Ard Biesheuvel
This is a straight transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S.

Signed-off-by: Ard Biesheuvel 
---
 arch/arm/crypto/Kconfig                        |   5 +
 arch/arm/crypto/Makefile                       |   2 +
 arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++-
 arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c |  23 +-
 4 files changed, 277 insertions(+), 210 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+   tristate "CRCT10DIF digest algorithm using PMULL instructions"
+   depends on KERNEL_MODE_NEON && CRC_T10DIF
+   select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y  := sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
   cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S 
b/arch/arm/crypto/crct10dif-ce-core.S
similarity index 60%
copy from arch/arm64/crypto/crct10dif-ce-core.S
copy to arch/arm/crypto/crct10dif-ce-core.S
index 9148ebd3470a..30168b0f8581 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -1,5 +1,5 @@
 //
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd 
 //
@@ -71,20 +71,43 @@
 #include 
 #include 
 
-       .text
-       .cpu            generic+crypto
-
-       arg1_low32      .req    w0
-       arg2            .req    x1
-       arg3            .req    x2
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)         code
+#endif
 
-       vzr             .req    v13
+       .text
+       .fpu            crypto-neon-fp-armv8
+
+       arg1_low32      .req    r0
+       arg2            .req    r1
+       arg3            .req    r2
+
+       qzr             .req    q13
+
+       q0l             .req    d0
+       q0h             .req    d1
+       q1l             .req    d2
+       q1h             .req    d3
+       q2l             .req    d4
+       q2h             .req    d5
+       q3l             .req    d6
+       q3h             .req    d7
+       q4l             .req    d8
+       q4h             .req    d9
+       q5l             .req    d10
+       q5h             .req    d11
+       q6l             .req    d12
+       q6h             .req    d13
+       q7l             .req    d14
+       q7h             .req    d15
 
 ENTRY(crc_t10dif_pmull)
-   stp x29, x30, [sp, #-32]!
-   mov x29, sp
+   push{r4, lr}
+   sub sp, sp, #0x10
 
-   movivzr.16b, #0 // init zero register
+   vmov.i8 qzr, #0 // init zero register
 
// adjust the 16-bit initial_crc value, scale it to 32 bits
lsl arg1_low32, arg1_low32, #16
@@ -93,41 +116,44 @@ ENTRY(crc_t10dif_pmull)
cmp arg3, #256
 
// for sizes less than 128, we can't fold 64B at a time...
-   b.lt_less_than_128
+   blt _less_than_128
 
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
// to be moved to the high part of the register.
// because data will be byte-reflected and will align with
// initial crc at correct place.
-   moviv10.16b, #0
-   mov v10.s[3], arg1_low32// initial crc
+   vmovs0, arg1_low32  // initial crc
+   vext.8  q10, qzr, q0, #4
 
// receive the initial 64B data, xor the initial crc value
-   ld1 {v0.2d-v3.2d}, [arg2], #0x40
-   ld1 {v4.2d-v7.2d}, [arg2], #0x40
-CPU_LE(rev64   v0.16b, v0.