>>>> Attached is promised patch that reworks >>>> interworking logic. As mentioned earlier idea is to use __ARM_ARCH__>=5 >>>> || !defined(__thumb__). Rationale is that load to pc does interworking >>>> since ARMv5, but without __thumb__ it does what we need even on ARMv4. >>>> >>> >>> OK, this appears to build and run fine when built for ARMv5/arm and >>> ARMv5/thumb using the Ubuntu softfloat toolchains (arm-linux-gnueabi) >>> >>> The only use case that we may break is where someone links to the >>> internal libcrypto.so symbols directly, and calls them from thumb code >>> on an ARMv4t but I guess if you deserve the pain in that case :-) >> >> Correct. Those who attempt to run Thumb binary linked directly to e.g. >> AES_encrypt in "universal" non-Thumb libcrypto.so specifically on ARMv4t >> processor will suffer. > > Well, it probably should be noted that Thumb code with non-Thumb shared > library on ARMv4 takes special compiler. At least stock gcc doesn't > generate those magic epilogues with moveq pc,lr, but simply issues > "target CPU does not support interworking" warning. And passing > -march=armv4t simply generates bx lr in both cases, i.e. with and > without -mthumb... So that proposed pre-processor logic is actually out > of sync with stock compiler and it makes sense to harmonize... I'll > ponder and post another version.
Attached patch harmonizes interworking logic with stock compiler. The only difference from previous version is that all [!]defined(__thumb__) are replaced with [!]defined(__ARM_ARCH_4T__), so that it adheres to bx lr with -march=armv4t regardless of the accompanying -mthumb.
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl index a620a7c..54dcd5b 100644 --- a/crypto/aes/asm/aes-armv4.pl +++ b/crypto/aes/asm/aes-armv4.pl @@ -275,13 +275,11 @@ AES_encrypt: strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] #endif -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size AES_encrypt,.-AES_encrypt @@ -715,12 +713,10 @@ _armv4_AES_set_encrypt_key: .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} .Labrt: -#if __ARM_ARCH__>=5 - ret @ bx lr +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) + bx lr #else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + mov pc,lr #endif .size AES_set_encrypt_key,.-AES_set_encrypt_key @@ -831,13 +827,11 @@ $code.=<<___; bne .Lmix mov r0,#0 -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size AES_set_enc2dec_key,.-AES_set_enc2dec_key @@ -1043,13 +1037,11 @@ AES_decrypt: strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] #endif -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size AES_decrypt,.-AES_decrypt @@ -1202,9 +1194,6 @@ _armv4_AES_decrypt: .align 2 ___ -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx\tlr/gm; - open SELF,$0; while(<SELF>) { next if (/^#!/); diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 373b3d7..2f7f0cc 100644 --- 
a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -4,6 +4,9 @@ #if !defined(__ARM_ARCH__) # if defined(__CC_ARM) # define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__TARGET_ARCH_4T) +# define __ARM_ARCH_4T__ +# endif # if defined(__BIG_ENDIAN) # define __ARMEB__ # else diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S index 65010ae..d9fc07d 100644 --- a/crypto/armv4cpuid.S +++ b/crypto/armv4cpuid.S @@ -33,10 +33,12 @@ OPENSSL_atomic_add: add r2,r2,r5 str r2,[r4] str r0,[r6] @ release spinlock +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) + ldmia sp!,{r4-r6,pc} +#else ldmia sp!,{r4-r6,lr} - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr + bx lr +#endif #endif .size OPENSSL_atomic_add,.-OPENSSL_atomic_add @@ -67,12 +69,10 @@ OPENSSL_cleanse: adds r1,r1,#4 bne .Little .Lcleanse_done: -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) bx lr #else - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr + mov pc,lr #endif .size OPENSSL_cleanse,.-OPENSSL_cleanse @@ -152,12 +152,10 @@ OPENSSL_wipe_cpu: .Lwipe_done: #endif mov r0,sp -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) bx lr #else - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr + mov pc,lr #endif .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu @@ -165,12 +163,10 @@ OPENSSL_wipe_cpu: .type OPENSSL_instrument_bus,%function OPENSSL_instrument_bus: eor r0,r0,r0 -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) bx lr #else - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr + mov pc,lr #endif .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus @@ -178,12 +174,10 @@ OPENSSL_instrument_bus: .type OPENSSL_instrument_bus2,%function OPENSSL_instrument_bus2: eor r0,r0,r0 -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) bx lr #else - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr + mov pc,lr #endif .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index 
8f529c9..c61238e 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -191,13 +191,11 @@ $code.=<<___; add sp,sp,#32 @ destroy tab[8] str $lo,[$ret,#4] -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r10,pc} #else ldmia sp!,{r4-r10,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif ___ } @@ -258,7 +256,7 @@ $code.=<<___; veor $r, $r, $t2 vst1.32 {$r}, [r0] - ret @ bx lr + bx lr #endif ___ } @@ -280,9 +278,7 @@ ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 1d330e9..c27bd73 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -231,12 +231,10 @@ bn_mul_mont: add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 .Labrt: -#if __ARM_ARCH__>=5 - ret @ bx lr +#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__) + bx lr #else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + mov pc,lr #endif .size bn_mul_mont,.-bn_mul_mont ___ @@ -656,7 +654,7 @@ bn_mul8x_mont_neon: sub sp,ip,#96 vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} - ret @ bx lr + bx lr .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon #endif ___ @@ -670,7 +668,5 @@ $code.=<<___; ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; print $code; close STDOUT; diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl index 77fbf34..15576cd 100644 --- a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -229,13 
+229,11 @@ $code.=<<___; bne .Louter add sp,sp,#36 -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r11,pc} #else ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size gcm_ghash_4bit,.-gcm_ghash_4bit @@ -306,13 +304,11 @@ gcm_gmult_4bit: ___ &Zsmash(); $code.=<<___; -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r11,pc} #else ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ @@ -387,7 +383,7 @@ gcm_init_neon: veor $IN,$IN,$t0 @ twisted H vstmia r0,{$IN} - ret @ bx lr + bx lr .size gcm_init_neon,.-gcm_init_neon .global gcm_gmult_neon @@ -471,7 +467,7 @@ $code.=<<___; vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi vst1.64 $Xl#lo,[$Xi,:64] - ret @ bx lr + bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif ___ @@ -484,9 +480,7 @@ ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index b2c3032..d1c3b4f 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -249,13 +249,11 @@ $code.=<<___; teq $inp,$len bne .Lloop @ [+18], total 1307 -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size sha1_block_data_order,.-sha1_block_data_order @@ -634,7 
+632,7 @@ $code.=<<___; vst1.32 {$E\[0]},[$ctx] vldmia sp!,{d8-d15} - ret @ bx lr + bx lr .size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 #endif ___ @@ -674,9 +672,6 @@ foreach (split($/,$code)) { s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo; - s/\bret\b/bx lr/o or - s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4 - print $_,$/; } diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index b0ae936..84998d3 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -241,13 +241,11 @@ $code.=<<___; bne .Loop add sp,sp,#`16+3`*4 @ destroy frame -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r11,pc} #else ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif .size sha256_block_data_order,.-sha256_block_data_order ___ @@ -611,7 +609,7 @@ $code.=<<___; vst1.32 {$ABCD,$EFGH},[$ctx] - ret @ bx lr + bx lr .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 #endif ___ @@ -652,9 +650,6 @@ foreach (split($/,$code)) { s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - print $_,"\n"; } diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index fb7dc50..a666a23 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -445,13 +445,11 @@ $code.=<<___; bne .Loop add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 +#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__) ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + bx lr @ interwork with v4t:-) #endif ___ @@ -589,7 +587,7 @@ $code.=<<___; bne .Loop_neon vldmia sp!,{d8-d15} @ epilogue - ret @ bx lr + bx lr #endif ___ } @@ 
-603,7 +601,5 @@ $code.=<<___; ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; print $code; close STDOUT; # enforce flush