Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Danny Tsen

Still waiting for the CCLA to be sent to OpenSSL.

Thanks.

-Danny

On 9/15/23 8:29 AM, Michael Ellerman wrote:

Danny Tsen  writes:

Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
up to 17%, as measured with tcrypt.  This is done by using one
instruction, vpermxor, to replace the xor and vsldoi pair.

The same changes were applied to OpenSSL code and a pull request was
submitted.

https://github.com/openssl/openssl/pull/21812

Still unmerged as of today.

cheers


Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Michael Ellerman
Danny Tsen  writes:
> Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
> up to 17%, as measured with tcrypt.  This is done by using one
> instruction, vpermxor, to replace the xor and vsldoi pair.
>
> The same changes were applied to OpenSSL code and a pull request was
> submitted.

https://github.com/openssl/openssl/pull/21812

Still unmerged as of today.

cheers


Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Danny Tsen

Thanks, Herbert.

-Danny

On 9/15/23 5:41 AM, Herbert Xu wrote:

On Wed, Aug 30, 2023 at 09:49:11AM -0400, Danny Tsen wrote:

Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
up to 17%, as measured with tcrypt.  This is done by using one
instruction, vpermxor, to replace the xor and vsldoi pair.

The same changes were applied to OpenSSL code and a pull request was
submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftests.  It has also been tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen 
---
  drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
  1 file changed, 92 insertions(+), 49 deletions(-)

Patch applied.  Thanks.


Re: [PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-09-15 Thread Herbert Xu
On Wed, Aug 30, 2023 at 09:49:11AM -0400, Danny Tsen wrote:
> Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
> up to 17%, as measured with tcrypt.  This is done by using one
> instruction, vpermxor, to replace the xor and vsldoi pair.
> 
> The same changes were applied to OpenSSL code and a pull request was
> submitted.
> 
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftests.  It has also been tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
> 
> Signed-off-by: Danny Tsen 
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
>  1 file changed, 92 insertions(+), 49 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH v2] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

2023-08-30 Thread Danny Tsen
Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
up to 17%, as measured with tcrypt.  This is done by using one
instruction, vpermxor, to replace the xor and vsldoi pair.
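
As a rough sketch of what that sequence computes (illustrative C, not
part of the patch; xts_mul_x is a hypothetical name): the tweak update
is multiplication by x in GF(2^128), with t[0] the least-significant
byte.

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), reducing by
 * x^128 + x^7 + x^2 + x + 1 (the 0x87 byte). */
static void xts_mul_x(uint8_t t[16])
{
        uint8_t carry = t[15] >> 7;     /* bit shifted out of the top */
        int i;

        for (i = 15; i > 0; i--)
                t[i] = (uint8_t)(t[i] << 1) | (t[i - 1] >> 7);
        t[0] = (uint8_t)(t[0] << 1) ^ (carry ? 0x87 : 0x00);
}

The vector code computes this for all 16 byte lanes at once: vaddubm
doubles every byte, vsrab and vand derive each byte's carry-in (0x01,
or 0x87 for the wrap-around byte), and the final rotate-and-xor of the
carries (formerly a vsldoi/vxor pair) is what vpermxor now does in one
instruction.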

The same changes were applied to OpenSSL code and a pull request was
submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftests.  It has also been tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
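
For reference, the tcrypt speed tests can be driven with something like
the line below (an assumption about the exact setup used; mode numbers
are defined in crypto/tcrypt.c, where mode 200 covers the aes cipher
speed tests including xts(aes)).  Results appear in the kernel log:

    modprobe tcrypt mode=200 sec=1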

Signed-off-by: Danny Tsen 
---
 drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
 .long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
 .long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
 .long   0,0,0,0                                         ?asis
+.long   0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $ptr     #vvvvv "distance between . and rcon
-       addi    $ptr,$ptr,-0x48
+       addi    $ptr,$ptr,-0x58
        mtlr    r0
        blr
        .long   0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
        li      $x70,0x70
        mtspr   256,r0

+       xxlor   2, 32+$eighty7, 32+$eighty7
+       vsldoi  $eighty7,$tmp,$eighty7,1        # 0x010101..87
+       xxlor   1, 32+$eighty7, 32+$eighty7
+
+       # Load XOR Lconsts.
+       mr      $x70, r6
+       bl      Lconsts
+       lxvw4x  0, $x40, r6             # load XOR contents
+       mr      r6, $x70
+       li      $x70,0x70
+
        subi    $rounds,$rounds,3       # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm  v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_  # pre-load round[2]
 
+       # Switch to the following code, which uses 0x010101..87, to
+       # generate the tweak:
+       #     eighty7 = 0x010101..87
+       #     vsrab     tmp, tweak, seven    # next tweak value, right shift 7 bits
+       #     vand      tmp, tmp, eighty7    # last byte with carry
+       #     vaddubm   tweak, tweak, tweak  # left shift 1 bit (x2)
+       #     xxlor     vsx, 0, 0
+       #     vpermxor  tweak, tweak, tmp, vsx
+
         vperm          $in0,$inout,$inptail,$inpperm
         subi           $inp,$inp,31            # undo "caller"
        vxor            $twk0,$tweak,$rndkey0
        vsrab           $tmp,$tweak,$seven      # next tweak value
        vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
        vand            $tmp,$tmp,$eighty7
         vxor           $out0,$in0,$twk0
-       vxor            $tweak,$tweak,$tmp
+       xxlor           32+$in1, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in1
 
         lvx_u          $in1,$x10,$inp
        vxor            $twk1,$tweak,$rndkey0
        vsrab           $tmp,$tweak,$seven      # next tweak value
        vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
        le?vperm        $in1,$in1,$in1,$leperm
        vand            $tmp,$tmp,$eighty7
         vxor           $out1,$in1,$twk1
-       vxor            $tweak,$tweak,$tmp
+       xxlor           32+$in2, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in2
 
         lvx_u          $in2,$x20,$inp
         andi.          $taillen,$len,15
        vxor            $twk2,$tweak,$rndkey0
        vsrab           $tmp,$tweak,$seven      # next tweak value
        vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
        le?vperm        $in2,$in2,$in2,$leperm
        vand            $tmp,$tmp,$eighty7
         vxor           $out2,$in2,$twk2
-       vxor            $tweak,$tweak,$tmp
+       xxlor           32+$in3, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in3
 
         lvx_u          $in3,$x30,$inp
         sub            $len,$len,$taillen
        vxor            $twk3,$tweak,$rndkey0
        vsrab           $tmp,$tweak,$seven      # next tweak value
        vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
        le?vperm        $in3,$in3,$in3,$leperm
        vand            $tmp,$tmp,$eighty7
         vxor           $out3,$in3,$twk3
-       vxor            $tweak,$tweak,$tmp
+       xxlor           32+$in4, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in4
 
         lvx_u          $in4,$x40,$inp
         subi           $len,$len,0x60
        vxor            $twk4,$tweak,$rndkey0
        vsrab           $tmp,$tweak,$seven      # next tweak value
        vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
        le?vperm        $in4,$in4,$in4,$leperm
        vand            $tmp,$tmp,$eighty7
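
To see why one vpermxor can replace the vsldoi/vxor pair in each stanza
above, here is a byte-lane C model of the update (a sketch;
tweak_update_model is a hypothetical name, and the lane pairing is an
assumption read off the permute constant 0x0f102132..cbdcedfe added to
the rcon table):

#include <stdint.h>
#include <string.h>

/* Byte-lane model of the vector tweak update.  Doubling each byte
 * (vaddubm) drops every byte's MSB; vsrab plus vand turn those
 * dropped bits into per-lane carries, 0x01 into the neighbouring
 * byte and 0x87 wrapping around.  The old code applied the carries
 * with a one-byte rotate (vsldoi) followed by vxor; vpermxor selects
 * the rotated carry byte and xors it in a single instruction. */
static void tweak_update_model(uint8_t t[16])
{
        uint8_t out[16];
        int i;

        for (i = 0; i < 16; i++) {
                int src = (i + 15) % 16;   /* carry comes from the adjacent lane */
                uint8_t carry = (t[src] & 0x80) ? (i == 0 ? 0x87 : 0x01) : 0x00;

                out[i] = (uint8_t)(t[i] << 1) ^ carry;  /* vaddubm ^ carry */
        }
        memcpy(t, out, 16);
}

For any input this produces the same result as the scalar xts_mul_x
sketch earlier in the thread, which is the equivalence the patch relies
on.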