Improve AES/XTS performance of the 6-way unrolled code path for PowerPC
by up to 17%, as measured with tcrypt. This is done by using a single
instruction, vpermxor, to replace the vsldoi/vxor pair used to compute
the next tweak.
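
For reference, a byte-level C model of vpermxor (illustrative of the
Power ISA semantics with big-endian element numbering; this is not
kernel code and the function name is made up):

	#include <stdint.h>

	/*
	 * The high nibble of each control byte selects a byte of 'a',
	 * the low nibble a byte of 'b'; the two are XORed into 'rt'.
	 */
	static void vpermxor_model(uint8_t rt[16], const uint8_t a[16],
				   const uint8_t b[16], const uint8_t ctrl[16])
	{
		int i;

		for (i = 0; i < 16; i++)
			rt[i] = a[ctrl[i] >> 4] ^ b[ctrl[i] & 0x0f];
	}

With the permute-control constant added below (bytes 0x0f, 0x10, 0x21,
..., 0xfe), result byte i is a[i] ^ b[(i + 15) % 16], so one vpermxor
performs both the vsldoi-by-15 rotate of the carry mask and the vxor
that previously took two instructions.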
The same changes were applied to OpenSSL code and a pull request was
submitted.
This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the selftests. It was also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
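
The sequence being optimized computes the next XTS tweak, i.e. a
multiplication by x in GF(2^128) with the polynomial
x^128 + x^7 + x^2 + x + 1, as documented in the comment block added in
the third hunk below. A minimal C sketch of that operation (the helper
name is hypothetical; it is shown only to document what the vector code
computes on a full 16-byte vector at once):

	#include <stdint.h>

	static void xts_mul_x(uint8_t t[16])
	{
		uint8_t carry = 0;
		int i;

		/* Bytewise left shift of the 128-bit tweak by one bit. */
		for (i = 0; i < 16; i++) {
			uint8_t c = t[i] >> 7;

			t[i] = (uint8_t)(t[i] << 1) | carry;
			carry = c;
		}

		/* Fold the carry out of bit 127 back in as 0x87. */
		if (carry)
			t[0] ^= 0x87;
	}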
Signed-off-by: Danny Tsen
---
drivers/crypto/vmx/aesp8-ppc.pl | 141 +---
1 file changed, 92 insertions(+), 49 deletions(-)
diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
.long 0,0,0,0 ?asis
+.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
Lconsts:
mflr r0
bcl 20,31,\$+4
mflr $ptr #vvvvvv "distance between . and rcon
- addi $ptr,$ptr,-0x48
+ addi $ptr,$ptr,-0x58
mtlr r0
blr
.long 0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
li $x70,0x70
mtspr 256,r0
+ xxlor 2, 32+$eighty7, 32+$eighty7
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+ xxlor 1, 32+$eighty7, 32+$eighty7
+
+ # Load XOR Lconsts.
+ mr $x70, r6
+ bl Lconsts
+ lxvw4x 0, $x40, r6 # load XOR contents
+ mr r6, $x70
+ li $x70,0x70
+
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
+ # Switch to use the following codes with 0x010101..87 to generate tweak.
+ # eighty7 = 0x010101..87
+ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
+ # vand tmp, tmp, eighty7 # last byte with carry
+ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
+ # xxlor vsx, 0, 0
+ # vpermxor tweak, tweak, tmp, vsx
+
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7