>>>> Attached is the promised patch that reworks the interworking logic.
>>>> As mentioned earlier, the idea is to use __ARM_ARCH__>=5 ||
>>>> !defined(__thumb__). The rationale is that a load to pc performs
>>>> interworking since ARMv5, while without __thumb__ it does what we
>>>> need even on ARMv4.
>>>>
>>>
>>> OK, this appears to build and run fine for ARMv5/arm and ARMv5/thumb
>>> using the Ubuntu soft-float toolchain (arm-linux-gnueabi).
>>>
>>> The only use case we may break is where someone links to the
>>> internal libcrypto.so symbols directly and calls them from Thumb code
>>> on ARMv4T, but I guess you deserve the pain in that case :-)
>>
>> Correct. Those who attempt to run a Thumb binary linked directly to
>> e.g. AES_encrypt in a "universal" non-Thumb libcrypto.so, specifically
>> on an ARMv4T processor, will suffer.
> 
> Well, it should probably be noted that Thumb code with a non-Thumb
> shared library on ARMv4 takes a special compiler. At least stock gcc
> doesn't generate those magic epilogues with moveq pc,lr, but simply
> issues a "target CPU does not support interworking" warning. And
> passing -march=armv4t simply generates bx lr in both cases, i.e. with
> and without -mthumb... So the proposed pre-processor logic is actually
> out of sync with the stock compiler, and it makes sense to harmonize...
> I'll ponder and post another version.

The attached patch harmonizes the interworking logic with the stock
compiler. The only difference from the previous version is that every
[!]defined(__thumb__) is replaced with [!]defined(__ARM_ARCH_4T__), so
that the code sticks to bx lr with -march=armv4t regardless of an
accompanying -mthumb.

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index a620a7c..54dcd5b 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -275,13 +275,11 @@ AES_encrypt:
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
 #endif
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r12,pc}
 #else
 	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	AES_encrypt,.-AES_encrypt
 
@@ -715,12 +713,10 @@ _armv4_AES_set_encrypt_key:
 .Ldone:	mov	r0,#0
 	ldmia   sp!,{r4-r12,lr}
 .Labrt:
-#if __ARM_ARCH__>=5
-	ret				@ bx lr
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
+	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	mov	pc,lr
 #endif
 .size	AES_set_encrypt_key,.-AES_set_encrypt_key
 
@@ -831,13 +827,11 @@ $code.=<<___;
 	bne	.Lmix
 
 	mov	r0,#0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r12,pc}
 #else
 	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	AES_set_enc2dec_key,.-AES_set_enc2dec_key
 
@@ -1043,13 +1037,11 @@ AES_decrypt:
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
 #endif
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r12,pc}
 #else
 	ldmia   sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	AES_decrypt,.-AES_decrypt
 
@@ -1202,9 +1194,6 @@ _armv4_AES_decrypt:
 .align	2
 ___
 
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx\tlr/gm;
-
 open SELF,$0;
 while(<SELF>) {
 	next if (/^#!/);
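
[Note on the substitutions deleted above, and in the sibling modules
below: .word 0xe12fff1e is simply the fixed ARM encoding of "bx lr",
emitted as data so the file would still assemble with -march=armv4,
where gas rejects the bx mnemonic. Now that bx lr appears only under
v4t-or-later conditionals, or in NEON code that already implies >=ARMv7,
the hand-encoded form and the "ret" alias feeding it can go. The two
spellings produce the identical word:]

	bx	lr		@ encodes as 0xe12fff1e (needs >=armv4t gas)
	.word	0xe12fff1e	@ same instruction, spelled as data
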
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 373b3d7..2f7f0cc 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -4,6 +4,9 @@
 #if !defined(__ARM_ARCH__)
 # if defined(__CC_ARM)
 #  define __ARM_ARCH__ __TARGET_ARCH_ARM
+#  if defined(__TARGET_ARCH_4T)
+#   define __ARM_ARCH_4T__
+#  endif
 #  if defined(__BIG_ENDIAN)
 #   define __ARMEB__
 #  else
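
[The arm_arch.h shim above is only needed for ARM's own compiler, which
spells the target macro __TARGET_ARCH_4T. Stock gcc predefines
__ARM_ARCH_4T__ by itself when targeting armv4t, which a cross
toolchain should confirm along these lines:]

	$ echo | arm-linux-gnueabi-gcc -march=armv4t -dM -E - | grep ARM_ARCH_4T
	#define __ARM_ARCH_4T__ 1
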
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 65010ae..d9fc07d 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -33,10 +33,12 @@ OPENSSL_atomic_add:
 	add	r2,r2,r5
 	str	r2,[r4]
 	str	r0,[r6]		@ release spinlock
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
+	ldmia	sp!,{r4-r6,pc}
+#else
 	ldmia	sp!,{r4-r6,lr}
-	tst	lr,#1
-	moveq	pc,lr
-	.word	0xe12fff1e	@ bx	lr
+	bx	lr
+#endif
 #endif
 .size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
 
@@ -67,12 +69,10 @@ OPENSSL_cleanse:
 	adds	r1,r1,#4
 	bne	.Little
 .Lcleanse_done:
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
 	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr
-	.word	0xe12fff1e	@ bx	lr
+	mov	pc,lr
 #endif
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 
@@ -152,12 +152,10 @@ OPENSSL_wipe_cpu:
 .Lwipe_done:
 #endif
 	mov	r0,sp
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
 	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr
-	.word	0xe12fff1e	@ bx	lr
+	mov	pc,lr
 #endif
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
@@ -165,12 +163,10 @@ OPENSSL_wipe_cpu:
 .type	OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
 	eor	r0,r0,r0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
 	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr
-	.word	0xe12fff1e	@ bx	lr
+	mov	pc,lr
 #endif
 .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 
@@ -178,12 +174,10 @@ OPENSSL_instrument_bus:
 .type	OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
 	eor	r0,r0,r0
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
 	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr
-	.word	0xe12fff1e	@ bx	lr
+	mov	pc,lr
 #endif
 .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..c61238e 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -191,13 +191,11 @@ $code.=<<___;
 	add	sp,sp,#32		@ destroy tab[8]
 	str	$lo,[$ret,#4]
 
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r10,pc}
 #else
 	ldmia	sp!,{r4-r10,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 ___
 }
@@ -258,7 +256,7 @@ $code.=<<___;
 	veor		$r, $r, $t2
 
 	vst1.32		{$r}, [r0]
-	ret		@ bx lr
+	bx		lr
 #endif
 ___
 }
@@ -280,9 +278,7 @@ ___
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
-	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
-	s/\bret\b/bx	lr/go		or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
 
 	print $_,"\n";
 }
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 1d330e9..c27bd73 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -231,12 +231,10 @@ bn_mul_mont:
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
 .Labrt:
-#if __ARM_ARCH__>=5
-	ret				@ bx lr
+#if __ARM_ARCH__>=5 || defined(__ARM_ARCH_4T__)
+	bx	lr
 #else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	mov	pc,lr
 #endif
 .size	bn_mul_mont,.-bn_mul_mont
 ___
@@ -656,7 +654,7 @@ bn_mul8x_mont_neon:
 	sub	sp,ip,#96
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
-	ret						@ bx lr
+	bx	lr
 .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
 ___
@@ -670,7 +668,5 @@ $code.=<<___;
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx	lr/gm;
 print $code;
 close STDOUT;
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 77fbf34..15576cd 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -229,13 +229,11 @@ $code.=<<___;
 	bne	.Louter
 
 	add	sp,sp,#36
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r11,pc}
 #else
 	ldmia	sp!,{r4-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	gcm_ghash_4bit,.-gcm_ghash_4bit
 
@@ -306,13 +304,11 @@ gcm_gmult_4bit:
 ___
 	&Zsmash();
 $code.=<<___;
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r11,pc}
 #else
 	ldmia	sp!,{r4-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
 ___
@@ -387,7 +383,7 @@ gcm_init_neon:
 	veor		$IN,$IN,$t0		@ twisted H
 	vstmia		r0,{$IN}
 
-	ret					@ bx lr
+	bx		lr
 .size	gcm_init_neon,.-gcm_init_neon
 
 .global	gcm_gmult_neon
@@ -471,7 +467,7 @@ $code.=<<___;
 	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
 	vst1.64		$Xl#lo,[$Xi,:64]
 
-	ret					@ bx lr
+	bx		lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -484,9 +480,7 @@ ___
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
-	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
-	s/\bret\b/bx	lr/go		or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
 
 	print $_,"\n";
 }
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index b2c3032..d1c3b4f 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -249,13 +249,11 @@ $code.=<<___;
 	teq	$inp,$len
 	bne	.Lloop			@ [+18], total 1307
 
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r12,pc}
 #else
 	ldmia	sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	sha1_block_data_order,.-sha1_block_data_order
 
@@ -634,7 +632,7 @@ $code.=<<___;
 	vst1.32		{$E\[0]},[$ctx]
 
 	vldmia	sp!,{d8-d15}
-	ret					@ bx lr
+	bx	lr
 .size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
@@ -674,9 +672,6 @@ foreach (split($/,$code)) {
 
 	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
 
-	s/\bret\b/bx	lr/o		or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4
-
 	print $_,$/;
 }
 
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index b0ae936..84998d3 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -241,13 +241,11 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#`16+3`*4	@ destroy frame
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r11,pc}
 #else
 	ldmia	sp!,{r4-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 .size	sha256_block_data_order,.-sha256_block_data_order
 ___
@@ -611,7 +609,7 @@ $code.=<<___;
 
 	vst1.32		{$ABCD,$EFGH},[$ctx]
 
-	ret		@ bx lr
+	bx		lr
 .size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
@@ -652,9 +650,6 @@ foreach (split($/,$code)) {
 
 	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
 
-	s/\bret\b/bx	lr/go		or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
-
 	print $_,"\n";
 }
 
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index fb7dc50..a666a23 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -445,13 +445,11 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
-#if __ARM_ARCH__>=5
+#if __ARM_ARCH__>=5 || !defined(__ARM_ARCH_4T__)
 	ldmia	sp!,{r4-r12,pc}
 #else
 	ldmia	sp!,{r4-r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
+	bx	lr			@ interwork with v4t:-)
 #endif
 ___
 
@@ -589,7 +587,7 @@ $code.=<<___;
 	bne		.Loop_neon
 
 	vldmia	sp!,{d8-d15}		@ epilogue
-	ret				@ bx lr
+	bx	lr
 #endif
 ___
 }
@@ -603,7 +601,5 @@ $code.=<<___;
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx	lr/gm;
 print $code;
 close STDOUT; # enforce flush
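
For anyone who wants to double-check, a possible smoke test; file names
and flags are illustrative, assuming the usual perl-to-.S generation and
an arm-linux-gnueabi cross toolchain:

	perl crypto/aes/asm/aes-armv4.pl > aes-armv4.S
	arm-linux-gnueabi-gcc -Icrypto -march=armv4 -c aes-armv4.S
	arm-linux-gnueabi-objdump -d aes-armv4.o | grep -E 'mov.*pc|bx.*lr|ldmia'

With -march=armv4 the returns should come out as mov pc,lr and loads to
pc, with no bx anywhere; with -march=armv4t everything should return
through bx lr; and with -march=armv5te the register-restoring paths
should pop straight into pc again.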
