>>>> arm-linux-gnueabi-gcc -I.. -I../.. -I../modes -I../asn1 -I../evp
>>>> -I../../include -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN
>>>> -DHAVE_DLFCN_H -D__ARM_MAX_ARCH__=8 -DTERMIO -O3 -Wall
>>>> -DOPENSSL_BN_ASM_MONT -DOPENSSL_BN_ASM_GF2m -DSHA1_ASM -DSHA256_ASM
>>>> -DSHA512_ASM -DAES_ASM -DBSAES_ASM -DGHASH_ASM -c -c -o aesv8-armx.o
>>>> aesv8-armx.S
>>>> aesv8-armx.S: Assembler messages:
>>>> aesv8-armx.S:574: Error: selected processor does not support ARM mode
>>>> `rev r8,r8'
>>>> make[2]: *** [aesv8-armx.o] Error 1
>>>>
>>
>> Attached is a version that attempts to exploit the fact that the .arch
>> directive appears to be positional, and adds .arch armv7-a everywhere
>> except at the very beginning of mixed-code files. For this I had to
>> reorder code paths in the armv4cpuid and armv4-gf2m modules.
>>
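(For reference, a minimal sketch of what such a mixed-code file looks like
under that approach; the function names below are made up, only the
placement of the .arch directive matters. The file is assembled for the
baseline architecture, and the arch is raised only after the generic code,
so the integer paths keep ARMv4-compatible encodings while the later NEON
paths are still accepted by the assembler:)

	.text
	.code	32

	.global	example_generic
	.type	example_generic,%function
example_generic:
	add	r0,r0,r1	@ plain integer code, baseline encoding
	mov	pc,lr

	.arch	armv7-a		@ positional: raises the target arch from here on
	.fpu	neon

	.global	example_neon
	.type	example_neon,%function
example_neon:
	vld1.32	{q0},[r0]	@ NEON code, reached only after a run-time
	vadd.i32	q0,q0,q0	@ capability check in the caller
	vst1.32	{q0},[r0]
	bx	lr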
>
> I have tested this with the stock Ubuntu ARM EABI soft-float toolchain
> (which targets armv5t, non-Thumb), added the max arch == 8, and it
> builds fine and produces a binary that will use NEON or crypto
> instructions if the CPU's capabilities allow it. (Tested on 32-bit and
> 64-bit ARM systems.)
Thank you very much! Attached is the promised patch that reworks the
interworking logic. As mentioned earlier, the idea is to use
__ARM_ARCH__>=5 || !defined(__thumb__). The rationale is that a load to
pc performs interworking since ARMv5, while without __thumb__ it does
what we need even on ARMv4.
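To make the two resulting return shapes explicit (this is only a summary of
the hunks below, not additional code): epilogues that restore a full
register frame become

#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}		@ >=v5: load to pc interworks; pre-v5
					@ ARM-only code needs no interworking
#else
	ldmia	sp!,{r4-r12,lr}		@ v4t Thumb build: pop lr, then
	bx	lr			@ bx to interwork with the caller
#endif

while leaf-style returns become

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr			@ bx is available on >=v5 and on any
					@ Thumb-capable (>=v4t) core
#else
	mov	pc,lr			@ plain ARMv4: no Thumb callers exist
#endif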
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index a620a7c..e242a6a 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -275,13 +275,11 @@ AES_encrypt:
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size AES_encrypt,.-AES_encrypt
@@ -715,12 +713,10 @@ _armv4_AES_set_encrypt_key:
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
.Labrt:
-#if __ARM_ARCH__>=5
- ret @ bx lr
+#if __ARM_ARCH__>=5 || defined(__thumb__)
+ bx lr
#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ mov pc,lr
#endif
.size AES_set_encrypt_key,.-AES_set_encrypt_key
@@ -831,13 +827,11 @@ $code.=<<___;
bne .Lmix
mov r0,#0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
@@ -1043,13 +1037,11 @@ AES_decrypt:
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size AES_decrypt,.-AES_decrypt
@@ -1202,9 +1194,6 @@ _armv4_AES_decrypt:
.align 2
___
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx\tlr/gm;
-
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 65010ae..b37d0f6 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -33,10 +33,12 @@ OPENSSL_atomic_add:
add r2,r2,r5
str r2,[r4]
str r0,[r6] @ release spinlock
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
+ ldmia sp!,{r4-r6,pc}
+#else
ldmia sp!,{r4-r6,lr}
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
+ bx lr
+#endif
#endif
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
@@ -67,12 +69,10 @@ OPENSSL_cleanse:
adds r1,r1,#4
bne .Little
.Lcleanse_done:
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
+ mov pc,lr
#endif
.size OPENSSL_cleanse,.-OPENSSL_cleanse
@@ -152,12 +152,10 @@ OPENSSL_wipe_cpu:
.Lwipe_done:
#endif
mov r0,sp
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
+ mov pc,lr
#endif
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
@@ -165,12 +163,10 @@ OPENSSL_wipe_cpu:
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
+ mov pc,lr
#endif
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
@@ -178,12 +174,10 @@ OPENSSL_instrument_bus:
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
+ mov pc,lr
#endif
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..3a3c21e 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -191,13 +191,11 @@ $code.=<<___;
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
___
}
@@ -258,7 +256,7 @@ $code.=<<___;
veor $r, $r, $t2
vst1.32 {$r}, [r0]
- ret @ bx lr
+ bx lr
#endif
___
}
@@ -280,9 +278,7 @@ ___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 1d330e9..ccce5ed 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -231,12 +231,10 @@ bn_mul_mont:
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
-#if __ARM_ARCH__>=5
- ret @ bx lr
+#if __ARM_ARCH__>=5 || defined(__thumb__)
+ bx lr
#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ mov pc,lr
#endif
.size bn_mul_mont,.-bn_mul_mont
___
@@ -656,7 +654,7 @@ bn_mul8x_mont_neon:
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
- ret @ bx lr
+ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
@@ -670,7 +668,5 @@ $code.=<<___;
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 77fbf34..26cf761 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -229,13 +229,11 @@ $code.=<<___;
bne .Louter
add sp,sp,#36
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -306,13 +304,11 @@ gcm_gmult_4bit:
___
&Zsmash();
$code.=<<___;
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
@@ -387,7 +383,7 @@ gcm_init_neon:
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
- ret @ bx lr
+ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
@@ -471,7 +467,7 @@ $code.=<<___;
vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
vst1.64 $Xl#lo,[$Xi,:64]
- ret @ bx lr
+ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
@@ -484,9 +480,7 @@ ___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index b2c3032..982cc94 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -249,13 +249,11 @@ $code.=<<___;
teq $inp,$len
bne .Lloop @ [+18], total 1307
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
@@ -634,7 +632,7 @@ $code.=<<___;
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
- ret @ bx lr
+ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
@@ -674,9 +672,6 @@ foreach (split($/,$code)) {
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
- s/\bret\b/bx lr/o or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
-
print $_,$/;
}
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index b0ae936..d0b6a66 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -241,13 +241,11 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
@@ -611,7 +609,7 @@ $code.=<<___;
vst1.32 {$ABCD,$EFGH},[$ctx]
- ret @ bx lr
+ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
@@ -652,9 +650,6 @@ foreach (split($/,$code)) {
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
print $_,"\n";
}
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index fb7dc50..56ffcb9 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -445,13 +445,11 @@ $code.=<<___;
bne .Loop
add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ bx lr @ interwork with v4t:-)
#endif
___
@@ -589,7 +587,7 @@ $code.=<<___;
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
- ret @ bx lr
+ bx lr
#endif
___
}
@@ -603,7 +601,5 @@ $code.=<<___;
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT; # enforce flush