Ping --- The attached patch updates the blackfin ___divsi3 and ___udivsi3 implementations (and updates ___umodsi3 to match), as well as adding .size directives to all functions in the file.
2011-03-24 Stuart Henderson <stuart.hender...@analog.com> * gcc/config/bfin/lib1funcs.asm (___divsi3): New implementation, add .size directive and unguard .text directive. (___udivsi3): New implementation and add .size directive. (___umodsi3): Update to match new ___divsi3/___udivsi3 implementations and add .size directive. (___modsi3): Add .size directive. (___umulsi3_highpart): Likewise. (___smulsi3_highpart): Likewise. Thanks, Stu Index: gcc/config/bfin/lib1funcs.asm =================================================================== --- gcc/config/bfin/lib1funcs.asm (revision 171215) +++ gcc/config/bfin/lib1funcs.asm (working copy) @@ -23,36 +23,101 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +.text + #ifdef L_divsi3 -.text .align 2 .global ___divsi3; .type ___divsi3, STT_FUNC; ___divsi3: - [--SP]= RETS; - [--SP] = R7; - R2 = -R0; - CC = R0 < 0; - IF CC R0 = R2; - R7 = CC; - +.Ls_main_branch: + R3 = R0 ^ R1; + R2 = - R0; + R0 = MAX(R0,R2); R2 = -R1; - CC = R1 < 0; - IF CC R1 = R2; - R2 = CC; - R7 = R7 ^ R2; + R1 = MAX(R1,R2); + R2 = R0 >> 1; + CC = R2 < R1 (IU); + IF CC JUMP .Ls_Q_has_only_one_bit; - CALL ___udivsi3; + P1 = R3; + R3 = R1>>1; + R3.L = SIGNBITS R3; + R1 = LSHIFT R1 BY R3.L; + R2 = R1 << 15; + CC = R2 == 0; + IF !CC JUMP .Ls_use_sfw_D_has_16bit_or_more; - CC = R7; + R2.L = SIGNBITS R0; + R0 = LSHIFT R0 BY R2.L; + R2.L = R3.L - R2.L (NS); + P2 = R2; + CC = R0 == R1; + IF CC JUMP .Ls_N_is_MIN_D_is_1_bit_set; + + R1 >>= 17; + +.Ls_use_divq_main_branch: + AQ = CC; + + LOOP(s_lp_use_divq) LC0 = P2; + LOOP_BEGIN s_lp_use_divq; + DIVQ(R0, R1); + LOOP_END s_lp_use_divq; + + R0 = EXTRACT(R0, R2.L) (Z); R1 = -R0; + CC = P1<0; IF CC R0 = R1; + RTS; - R7 = [SP++]; - RETS = [SP++]; - RTS; +.Ls_N_is_MIN_D_is_1_bit_set: + R0 = 1; + R0 = LSHIFT R0 BY R2.L; + R1 = -R0; + CC = P1 < 0; + IF CC R0 = R1; + RTS; + +.Ls_use_sfw_D_has_16bit_or_more: + R2 = R0 >> 1; + R2.L = SIGNBITS R2; + R3.H = R3.L - R2.L (NS); + R3 = R3 >>16; + P2 = R3; + R0 = LSHIFT R0 BY R2.L; + R0 = R0 - R1; + CC = !BITTST(R0, 31); + R1 >>>= 1; + + LOOP(s__use_sfw_loop) LC0 = P2; + LOOP_BEGIN s__use_sfw_loop; + R0 = R0 + R1, R2 = R0 - R1; + IF CC R0 = R2; + R0 = ROT R0 BY 1; + LOOP_END s__use_sfw_loop; + + R0 = EXTRACT(R0, R3.L)(Z); + R0 = ROT R0 BY 1; + R1 = -R0; + CC = P1<0; + IF CC R0 = R1; + RTS; + +.Ls_Q_has_only_one_bit: + CC = R1 <= R0 (IU); + R0 = CC; + R1 = -R0; + CC = R3<0; + IF CC R0 = R1; + RTS; + .size ___divsi3, .-___divsi3; #endif #ifdef L_modsi3 @@ -71,6 +136,8 @@ R0 = R1 - R2; RETS = [SP++]; RTS; + +.size ___modsi3, .-___modsi3 #endif #ifdef L_udivsi3 @@ -79,26 +146,87 @@ .type ___udivsi3, STT_FUNC; ___udivsi3: - P0 = 32; - LSETUP (0f, 1f) LC0 = P0; - /* upper half of dividend */ - R3 = 0; -0: - /* The first time round in the loop we shift in garbage, but since we - perform 33 shifts, it doesn't matter. */ +.Lu_main_branch: + R2 = R0 >> 1; + CC = R2 < R1 (IU); + IF CC JUMP .Lu_Q_has_only_one_bit; + + R3 = R1 >> 1; + R3.L = SIGNBITS R3; + R1 = LSHIFT R1 BY R3.L; + R2 = R1 << 15; + CC = R2 == 0; + IF !CC JUMP .Lu_use_sfw_D_has_16bit_or_more; + + CC = R0 < 0; + IF CC JUMP .Lu_MSB_of_N_is_1; + + R1.L = SIGNBITS R0; + R2.L = R3.L - R1.L (NS); + P2 = R2; + R0 = LSHIFT R0 BY R1.L; + R1 >>= 17; + +.Lu_use_divq_main_branch: + AQ = CC; + + LOOP(u_lp_use_divq_when_MSB_of_N_is_0) LC0 = P2; + LOOP_BEGIN u_lp_use_divq_when_MSB_of_N_is_0; + DIVQ(R0, R1); + LOOP_END u_lp_use_divq_when_MSB_of_N_is_0; + + R0 = EXTRACT(R0, R2.L) (Z); + RTS; + +.Lu_MSB_of_N_is_1: + R3 = R3.L (Z); + P2 = R3; + R0 = R0 - R1; + R1 >>= 17; + +.Lu_use_divq_when_MSB_of_N_is_1: + R2 = ~R0; + R2 = R2 >> 31; + CC = BITTST(R0, 31); + AQ = CC; + + LOOP(u_lp_use_divq_MSB_of_N_is_1) LC0 = P2; + LOOP_BEGIN u_lp_use_divq_MSB_of_N_is_1; + DIVQ(R0, R1); + LOOP_END u_lp_use_divq_MSB_of_N_is_1; + + R2 = LSHIFT R2 BY R3.L; + R0 = EXTRACT(R0, R3.L) (Z); + R0 = R0+R2; + RTS; + +.Lu_use_sfw_D_has_16bit_or_more: + R2 = R0>>1; + R2.L = SIGNBITS R2; + R3.H = R3.L - R2.L (NS); + R3 = R3 >> 16; + P2 = R3; + R0 = LSHIFT R0 BY R2.L; + R0 = R0 - R1; + CC = !BITTST(R0, 31); + R1 >>>= 1; + + LOOP(u__use_sfw_loop) LC0 = P2; + LOOP_BEGIN u__use_sfw_loop; + R0 = R0 + R1, R2 = R0 - R1; + IF CC R0 = R2; R0 = ROT R0 BY 1; - R3 = ROT R3 BY 1; - R2 = R3 - R1; - CC = R3 < R1 (IU); -1: - /* Last instruction of the loop. */ - IF ! CC R3 = R2; + LOOP_END u__use_sfw_loop; - /* Shift in the last bit. */ + R0 = EXTRACT(R0, R3.L)(Z); R0 = ROT R0 BY 1; - /* R0 is the result, R3 contains the remainder. */ - R0 = ~ R0; - RTS; + RTS; + +.Lu_Q_has_only_one_bit: + CC = R1 <= R0 (IU); + R0 = CC; + RTS; + .size ___udivsi3, .-___udivsi3; #endif #ifdef L_umodsi3 @@ -108,10 +236,17 @@ ___umodsi3: [--SP] = RETS; + [--SP] = R0; + [--SP] = R1; CALL ___udivsi3; - R0 = R3; - RETS = [SP++]; - RTS; + R2 = [SP++]; + R1 = [SP++]; + R2 *= R0; + R0 = R1 - R2; + RETS = [SP++]; + RTS; + +.size ___umodsi3, .-___umodsi3 #endif #ifdef L_umulsi3_highpart @@ -128,6 +263,8 @@ A0 += A1; R0 = A0 (FU); RTS; + +.size ___umulsi3_highpart, .-___umulsi3_highpart #endif #ifdef L_smulsi3_highpart @@ -143,4 +280,6 @@ A1 = A1 >>> 16; R0 = (A0 += A1); RTS; + +.size ___smulsi3_highpart, .-___smulsi3_highpart #endif