On 20 May 23:27, Vladimir Makarov wrote: > > > On 20/05/15 04:17 AM, Ilya Enkovich wrote: > >On 19 May 11:22, Vladimir Makarov wrote: > >>On 05/18/2015 08:13 AM, Ilya Enkovich wrote: > >>>2015-05-06 17:18 GMT+03:00 Ilya Enkovich <enkovich....@gmail.com>: > >>>Hi Vladimir, > >>> > >>>Could you please comment on this? > >>> > >>> > >>Ilya, I think that the idea is worth trying but results might be > >>mixed. It is hard to say until you actually try it (as an example, Jan > >>implemented -fpmath=both and it looks like a pretty good idea at least > >>for me but when I checked SPEC2000 the results were not so good even > >>with IRA/LRA). > >> > >>Long ago I did some experiments and found that spilling into SSE > >>would be beneficial for Intel CPUs but not for AMD ones. As I remember > >>I also found that storing several scalar values into one SSE reg and > >>extracting it when you need to do some (fp) arithmetic would be > >>beneficial for AMD but not for Intel CPUs. In the literature a more > >>general approach is called a bitwise register allocator. Actually it > >>would be a pretty big IRA/LRA project from which some targets might > >>benefit. > >I suspect such things are not trivially done in IRA/LRA and want to make it > >as an independent optimization because its application seems to be quite > >narrow. > Yes, that is true. The complications and implementation complexity > will probably be very high in this project and the positive results > are not certain. So the project might have a small value. > >> > >>As for the wrong code, it is hard for me to say anything w/o RA > >>dumps. If you send me the dump (-fira-verbose=16), I might say more > >>about what is going on. > >> > >> > >Here are some dumps from my reproducer. The problematic register is r108. > > > Thanks. For me it looks like an inheritance bug. It is really hard > to fix the bug w/o the source code. Could you send me your patch so > that I can debug RA with it to investigate more. >
Sure! Here is a patch and a testcase. I applied the patch to r222125. Command to reproduce: gcc -m32 -msse4.2 -O2 pr65105.c -S -march=slm -fPIE Thanks, Ilya
/* Reproducer for the wrong-code issue discussed in this message.

   counter is only declared here; its definition is external, so each
   call in the loop passes a live 64-bit value out of this function.

   test seeds TMP with arr[0] | (arr[1] & arr[2]) -- `&' binds tighter
   than `|' -- and then, while TMP is non-zero, passes TMP to counter
   and ANDs it with the element ARR currently points at, advancing ARR
   by one afterwards (so the first iteration re-reads arr[0]).  TMP is
   unsigned long long while counter takes long long, so the argument is
   implicitly converted at the call.

   The only loop exit is TMP becoming zero; nothing here bounds ARR, so
   the caller presumably supplies data that eventually clears TMP.

   Keep the expression shape and the `register' qualifier unchanged:
   this is a compiler testcase and its exact form is what is being
   exercised.  */
void counter (long long l); void test (long long *arr) { register unsigned long long tmp; tmp = arr[0] | arr[1] & arr[2]; while (tmp) { counter (tmp); tmp = *(arr++) & tmp; } }
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a607ef4..a9dbfea 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2554,6 +2554,789 @@ rest_of_handle_insert_vzeroupper (void) return 0; } +static bool +has_non_address_hard_reg (rtx_insn *insn) +{ + df_ref ref; + FOR_EACH_INSN_DEF (ref, insn) + if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) + && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)) + return true; + + FOR_EACH_INSN_USE (ref, insn) + if (!DF_REF_REG_MEM_P(ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) + return true; + + return false; +} + +static bool +scalar_to_vector_candidate_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + /* We are interested in DImode -> V1DI promotion + only. */ + if (GET_MODE (src) != DImode + || GET_MODE (dst) != DImode) + return false; + + if (!REG_P (dst) && !MEM_P (dst)) + return false; + + switch (GET_CODE (src)) + { + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + break; + + default: + return false; + } + + if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0))) + return false; + + if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1))) + return false; + + if (GET_MODE (XEXP (src, 0)) != DImode + || GET_MODE (XEXP (src, 1)) != DImode) + return false; + + return true; +} + +/* Remove regs having both convertible and + not convertible definitions. 
*/ +static void +remove_non_convertible_regs (bitmap insns) +{ + bitmap_iterator bi; + unsigned id; + bitmap regs = BITMAP_ALLOC (NULL); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) + { + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); + rtx reg = SET_DEST (def_set); + + if (!REG_P (reg) || bitmap_bit_p (regs, REGNO (reg))) + continue; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); + def; + def = DF_REF_NEXT_REG (def)) + { + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible definition in insn %d\n", + REGNO (reg), DF_REF_INSN_UID (def)); + + bitmap_set_bit (regs, REGNO (reg)); + break; + } + } + } + + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { + for (df_ref def = DF_REG_DEF_CHAIN (id); + def; + def = DF_REF_NEXT_REG (def)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (def)); + + bitmap_clear_bit (insns, DF_REF_INSN_UID (def)); + } + } + + BITMAP_FREE (regs); +} + +static bool +convertible_insn_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (GET_MODE (src) != DImode + || GET_MODE (dst) != DImode) + return false; + + /* Convert simple loads, stores and register copies. 
*/ + return ((REG_P (src) && MEM_P (dst)) + || ((MEM_P (src) && REG_P (dst))) + || ((REG_P (src) && REG_P (dst)))); +} + +class scalar_chain +{ + public: + scalar_chain (); + ~scalar_chain (); + + static unsigned max_id; + + unsigned int chain_id; + bitmap queue; + bitmap insns; + bitmap defs; + bitmap defs_conv; + + void build (bitmap candidates, unsigned insn_uid); + int compute_convert_gain (); + void convert (); + + private: + void add_insn (bitmap candidates, unsigned insn_uid); + void add_to_queue (unsigned insn_uid); + void mark_dual_mode_def (df_ref def); + void analyze_register_chain (bitmap candidates, df_ref ref); + void convert_insn (rtx_insn *insn); + void convert_op (rtx *op, rtx_insn *insn); + void convert_insn_defs (unsigned regno); + void make_scalar_copies (unsigned regno); + void make_vector_copies (unsigned regno); +}; + +unsigned scalar_chain::max_id = 0; + +scalar_chain::scalar_chain () +{ + chain_id = ++max_id; + + if (dump_file) + fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); + + bitmap_obstack_initialize (NULL); + insns = BITMAP_ALLOC (NULL); + defs = BITMAP_ALLOC (NULL); + defs_conv = BITMAP_ALLOC (NULL); + queue = NULL; +} + +scalar_chain::~scalar_chain () +{ + BITMAP_FREE (insns); + BITMAP_FREE (defs); + BITMAP_FREE (defs_conv); + bitmap_obstack_release (NULL); +} + +void +scalar_chain::add_to_queue (unsigned insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid) + || bitmap_bit_p (queue, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", + insn_uid, chain_id); + bitmap_set_bit (queue, insn_uid); +} + +void +scalar_chain::mark_dual_mode_def (df_ref def) +{ + gcc_assert (DF_REF_REG_DEF_P (def)); + + if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) + return; + + if (dump_file) + fprintf (dump_file, + " Mark r%d def in insn %d as requiring both modes in chain #%d\n", + DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); + + bitmap_set_bit (defs_conv, DF_REF_REGNO 
(def)); +} + +void +scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) +{ + df_link *chain; + for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) + { + unsigned uid = DF_REF_INSN_UID (chain->ref); + if (!DF_REF_REG_MEM_P (chain->ref)) + { + if (bitmap_bit_p (insns, uid)) + continue; + + if (bitmap_bit_p (candidates, uid)) + { + add_to_queue (uid); + continue; + } + + if (!DF_REF_REG_MEM_P (chain->ref) + && convertible_insn_p (DF_REF_INSN (chain->ref))) + { + if (dump_file) + fprintf (dump_file, " Mark insn %d as convertible\n", uid); + bitmap_set_bit (candidates, uid); + add_to_queue (uid); + continue; + } + } + + if (DF_REF_REG_DEF_P (chain->ref)) + { + if (dump_file) + fprintf (dump_file, " r%d def in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (chain->ref); + } + else + { + if (dump_file) + fprintf (dump_file, " r%d use in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (ref); + } + } +} + +void +scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); + + bitmap_set_bit (insns, insn_uid); + + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + if (def_set && REG_P (SET_DEST (def_set))) + bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); + + df_ref ref; + for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + analyze_register_chain (candidates, ref); + for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!DF_REF_REG_MEM_P (ref)) + analyze_register_chain (candidates, ref); +} + +void +scalar_chain::build (bitmap candidates, unsigned insn_uid) +{ + queue = BITMAP_ALLOC (NULL); + bitmap_set_bit (queue, insn_uid); + + if (dump_file) + fprintf (dump_file, "Building chain #%d...\n", chain_id); + + while (!bitmap_empty_p 
(queue)) + { + insn_uid = bitmap_first_set_bit (queue); + bitmap_clear_bit (queue, insn_uid); + bitmap_clear_bit (candidates, insn_uid); + add_insn (candidates, insn_uid); + } + + if (dump_file) + { + fprintf (dump_file, "Collected chain #%d...\n", chain_id); + fprintf (dump_file, " insns: "); + dump_bitmap (dump_file, insns); + if (!bitmap_empty_p (defs_conv)) + { + bitmap_iterator bi; + unsigned id; + const char *comma = ""; + fprintf (dump_file, " defs to convert: "); + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) + { + fprintf (dump_file, "%sr%d", comma, id); + comma = ", "; + } + fprintf (dump_file, "\n"); + } + } + + BITMAP_FREE (queue); +} + +int +scalar_chain::compute_convert_gain () +{ + bitmap_iterator bi; + unsigned insn_uid; + int gain = 0; + int cost = 0; + + if (dump_file) + fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (REG_P (src) && REG_P (dst)) + gain += COSTS_N_INSNS (2) - ix86_cost->sse_move; + else if (REG_P (src) && MEM_P (dst)) + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; + else if (MEM_P (src) && REG_P (dst)) + gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; + else if (GET_CODE (src) == PLUS + || GET_CODE (src) == MINUS + || GET_CODE (src) == IOR + || GET_CODE (src) == XOR + || GET_CODE (src) == AND) + gain += ix86_cost->add; + else + gcc_unreachable (); + } + + if (dump_file) + fprintf (dump_file, " Instruction convertion gain: %d\n", gain); + + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) + cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; + + if (dump_file) + fprintf (dump_file, " Registers convertion cost: %d\n", cost); + + gain -= cost; + + if (dump_file) + fprintf (dump_file, " Total gain: %d\n", gain); + + return gain; +} + +void 
+scalar_chain::make_vector_copies (unsigned regno) +{ + rtx reg = regno_reg_rtx[regno]; + rtx sireg = gen_reg_rtx (SImode); + rtx vcopy = gen_rtx_SUBREG (V2DImode, sireg, 0); + rtx vcopy1 = gen_rtx_SUBREG (V4SImode, sireg, 0); + df_ref ref; + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + start_sequence (); + if (TARGET_SSE4_1) + { + rtx tmp = gen_reg_rtx (SImode); + emit_move_insn (sireg, gen_rtx_SUBREG (SImode, reg, 0)); + emit_insn (gen_sse4_1_pinsrd (vcopy1, vcopy1, + gen_rtx_SUBREG (SImode, reg, 4), + GEN_INT (2))); + } + else + { + rtx tmp = gen_reg_rtx (SImode); + emit_move_insn (sireg, + gen_rtx_SUBREG (SImode, reg, 0)); + emit_move_insn (tmp, gen_rtx_SUBREG (SImode, reg, 4)); + emit_move_insn (gen_rtx_SUBREG (V2DImode, tmp, 0), + gen_rtx_ASHIFT (V2DImode, + gen_rtx_SUBREG (V2DImode, tmp, 0), + GEN_INT (32))); + emit_move_insn (vcopy, + gen_rtx_IOR (V2DImode, vcopy, + gen_rtx_SUBREG (V2DImode, tmp, 0))); + } + emit_insn_after (get_insns (), DF_REF_INSN (ref)); + end_sequence (); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a vector register r%d for insn %d\n", + regno, REGNO (sireg), DF_REF_INSN_UID (ref)); + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + replace_rtx (DF_REF_INSN (ref), reg, vcopy); + + if (dump_file) + fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", + regno, REGNO (sireg), DF_REF_INSN_UID (ref)); + } +} + +void +scalar_chain::convert_insn_defs (unsigned regno) +{ + bool scalar_copy = bitmap_bit_p (defs_conv, regno); + rtx reg = regno_reg_rtx[regno]; + rtx new_reg = NULL_RTX; + rtx scopy = NULL_RTX; + df_ref ref; + bitmap conv; + + conv = BITMAP_ALLOC (NULL); + bitmap_copy (conv, insns); + + /* Check we have load or store. In this case we cannot + just convert register to V2DI mode and have to use subreg. 
*/ + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + { + rtx def_set = single_set (DF_REF_INSN (ref)); + if (def_set && MEM_P (SET_SRC (def_set))) + { + new_reg = gen_rtx_SUBREG (V2DImode, reg, 0); + break; + } + } + + if (!new_reg) + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + { + rtx def_set = single_set (DF_REF_INSN (ref)); + if (def_set + && MEM_P (SET_DEST (def_set)) + && REG_P (SET_SRC (def_set))) + { + new_reg = gen_rtx_SUBREG (V2DImode, reg, 0); + break; + } + } + + if (!new_reg) + new_reg = gen_rtx_SUBREG (V2DImode, reg, 0); + + if (scalar_copy) + scopy = gen_reg_rtx (DImode); + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + { + rtx def_set = single_set (DF_REF_INSN (ref)); + rtx src = SET_SRC (def_set); + rtx reg = DF_REF_REG (ref); + + if (!MEM_P (src)) + { + replace_rtx (DF_REF_INSN (ref), reg, new_reg); + bitmap_clear_bit (conv, DF_REF_INSN_UID (ref)); + } + + if (scalar_copy) + { + rtx vcopy = gen_reg_rtx (V2DImode); + + start_sequence (); + emit_move_insn (vcopy, new_reg); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_move_insn (vcopy, + gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_insn_after (get_insns (), DF_REF_INSN (ref)); + end_sequence (); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a scalar register r%d for insn %d\n", + regno, REGNO (scopy), DF_REF_INSN_UID (ref)); + } + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) + { + rtx def_set = single_set (DF_REF_INSN (ref)); + if (!MEM_P (SET_DEST (def_set)) + || !REG_P (SET_SRC (def_set))) + replace_rtx (DF_REF_INSN (ref), reg, new_reg); + bitmap_clear_bit (conv, DF_REF_INSN_UID (ref)); + } + } + else + { + 
replace_rtx (DF_REF_INSN (ref), reg, scopy); + df_insn_rescan (DF_REF_INSN (ref)); + } + + BITMAP_FREE (conv); +} + +void +scalar_chain::make_scalar_copies (unsigned regno) +{ + rtx scopy = gen_reg_rtx (DImode); + rtx vcopy = gen_reg_rtx (V2DImode); + df_ref ref; + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + rtx reg = DF_REF_REG (ref); + + start_sequence (); + emit_move_insn (vcopy, reg); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_move_insn (vcopy, + gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_insn_after (get_insns (), DF_REF_INSN (ref)); + end_sequence (); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a scalar register r%d for insn %d\n", + REGNO (reg), REGNO (scopy), DF_REF_INSN_UID (ref)); + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + replace_rtx (DF_REF_INSN (ref), DF_REF_REG (ref), scopy); + df_insn_rescan (DF_REF_INSN (ref)); + + if (dump_file) + fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", + regno, REGNO (scopy), DF_REF_INSN_UID (ref)); + } +} + +void +scalar_chain::convert_op (rtx *op, rtx_insn *insn) +{ + *op = copy_rtx_if_shared (*op); + + if (MEM_P (*op)) + { + rtx tmp = gen_reg_rtx (DImode); + rtx tmpv2di = gen_rtx_SUBREG (V2DImode, tmp, 0); + + emit_insn_before (gen_move_insn (tmp, *op), insn); + *op = tmpv2di; + + if (dump_file) + fprintf (dump_file, " Preloading operand for insn %d into r%d\n", + INSN_UID (insn), REGNO (tmp)); + } + else if (REG_P (*op)) + { + //*op = gen_rtx_SUBREG (V2DImode, *op, 0); + gcc_assert (bitmap_bit_p + (insns, DF_REF_INSN_UID (DF_REG_DEF_CHAIN (REGNO (*op)))) + || bitmap_bit_p (defs_conv, REGNO (*op))); + } + else + { + gcc_assert (SUBREG_P (*op)); + gcc_assert 
(GET_MODE (*op) == V2DImode); + } +} + +void +scalar_chain::convert_insn (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + rtx src = copy_rtx_if_shared (SET_SRC (def_set)); + rtx dst = SET_DEST (def_set); + + if (MEM_P (dst)) + { + if (!REG_P (src)) + { + /* There are no scalar integer instructions and therefore + temporary register usage is required. */ + rtx tmp = gen_reg_rtx (DImode); + emit_insn_after (gen_move_insn (dst, tmp), insn); + dst = gen_rtx_SUBREG (V2DImode, tmp, 0); + } + } +#if 0 + else if (REG_P (dst)) + { + df_ref def = DF_REG_DEF_CHAIN (REGNO (dst)); + + if (MEM_P (src)) + { + df_link *link; + rtx subreg = gen_rtx_SUBREG (V2DImode, dst, 0); + for (link = DF_REF_CHAIN (def); link; link = link->next) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (link->ref))) + { + /* replace_rtx dive into subreg and goes into recursion. */ + rtx tmp = gen_reg_rtx (VOIDmode); + replace_rtx (DF_REF_INSN (link->ref), dst, tmp); + replace_rtx (DF_REF_INSN (link->ref), tmp, subreg); + + if (dump_file) + fprintf (dump_file, " Replace r%d with a subreg in insn %d\n", + REGNO (dst), DF_REF_INSN_UID (link->ref)); + } + } + else + { + PUT_MODE (dst, V2DImode); + } + } + else + gcc_unreachable (); +#endif + + switch (GET_CODE (src)) + { + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + convert_op (&XEXP (src, 0), insn); + convert_op (&XEXP (src, 1), insn); + PUT_MODE (src, V2DImode); + break; + + case MEM: + if (!REG_P (dst)) + convert_op (&src, insn); + break; + + case REG: + break; + + case SUBREG: + gcc_assert (GET_MODE (src) == V2DImode); + break; + + default: + gcc_unreachable (); + } + + SET_SRC (def_set) = src; + SET_DEST (def_set) = dst; + + /* Drop possible dead definitions. 
*/ + PATTERN (insn) = def_set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); +} + +void +scalar_chain::convert () +{ + bitmap_iterator bi; + unsigned id; + + if (dump_file) + fprintf (dump_file, "Converting chain #%d...\n", chain_id); + + EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) + convert_insn_defs (id); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) + convert_insn (DF_INSN_UID_GET (id)->insn); + + EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) + make_vector_copies (id); +} + +static unsigned int +convert_scalars_to_vector () +{ + basic_block bb; + bitmap candidates; + + bitmap_obstack_initialize (NULL); + candidates = BITMAP_ALLOC (NULL); + + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + + /* 1. Find all instructions we want to convert into vector mode. */ + if (dump_file) + fprintf (dump_file, "Searching for mode convertion candidates...\n"); + + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (scalar_to_vector_candidate_p (insn)) + { + if (dump_file) + fprintf (dump_file, " insn %d is marked as a candidate\n", + INSN_UID (insn)); + + bitmap_set_bit (candidates, INSN_UID (insn)); + } + } + + remove_non_convertible_regs (candidates); + + if (bitmap_empty_p (candidates)) + if (dump_file) + fprintf (dump_file, "There are no candidates for optimization.\n"); + + while (!bitmap_empty_p (candidates)) + { + unsigned uid = bitmap_first_set_bit (candidates); + scalar_chain chain; + + /* Find instructions chain we want to convert to vector mode. + Check all uses and definitions to estimate all required + convertions. 
*/ + chain.build (candidates, uid); + + if (chain.compute_convert_gain () > 0) + chain.convert (); + else + if (dump_file) + fprintf (dump_file, "Chain #%d convertion is not profitable\n", + chain.chain_id); + } + + BITMAP_FREE (candidates); + bitmap_obstack_release (NULL); + df_process_deferred_rescans (); + df_verify (); + + return 0; +} + namespace { const pass_data pass_data_insert_vzeroupper = @@ -2591,6 +3374,39 @@ public: }; // class pass_insert_vzeroupper +const pass_data pass_data_stv = +{ + RTL_PASS, /* type */ + "stv", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_stv : public rtl_opt_pass +{ +public: + pass_stv (gcc::context *ctxt) + : rtl_opt_pass (pass_data_stv, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return !TARGET_64BIT && TARGET_SSE2 && optimize > 1; + } + + virtual unsigned int execute (function *) + { + return convert_scalars_to_vector (); + } + +}; // class pass_stv + } // anon namespace rtl_opt_pass * @@ -2599,6 +3415,12 @@ make_pass_insert_vzeroupper (gcc::context *ctxt) return new pass_insert_vzeroupper (ctxt); } +rtl_opt_pass * +make_pass_stv (gcc::context *ctxt) +{ + return new pass_stv (ctxt); +} + /* Return true if a red-zone is in use. */ static inline bool @@ -4363,12 +5185,18 @@ ix86_option_override (void) = { pass_insert_vzeroupper, "reload", 1, PASS_POS_INSERT_AFTER }; + opt_pass *pass_stv = make_pass_stv (g); + struct register_pass_info stv_info + = { pass_stv, "combine", + 1, PASS_POS_INSERT_AFTER + }; ix86_option_override_internal (true, &global_options, &global_options_set); /* This needs to be done at start up. It's convenient to do it here. */ register_pass (&insert_vzeroupper_info); + register_pass (&stv_info); } /* Implement the TARGET_OFFLOAD_OPTIONS hook. 
*/ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 7195882..6aae22c 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -972,6 +972,11 @@ (HI "TARGET_HIMODE_MATH") SI]) +;; Math-dependant integer modes with DImode. +(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH") + (HI "TARGET_HIMODE_MATH") + SI DI]) + ;; Math-dependant single word integer modes without QImode. (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH") SI (DI "TARGET_64BIT")]) @@ -7731,9 +7736,9 @@ ;; it should be done with splitters. (define_expand "and<mode>3" - [(set (match_operand:SWIM 0 "nonimmediate_operand") - (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand") - (match_operand:SWIM 2 "<general_szext_operand>")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "<general_szext_operand>")))] "" { machine_mode mode = <MODE>mode; @@ -7811,6 +7816,43 @@ (const_string "*"))) (set_attr "mode" "SI,DI,DI,SI,DI")]) +(define_insn_and_split "*anddi3_doubleword" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r") + (and:DI + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)" + "#" + "!TARGET_64BIT && reload_completed" + [(parallel [(set (match_dup 0) + (and:SI (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 3) + (and:SI (match_dup 4) (match_dup 5))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);") + +(define_insn_and_split "*zext<mode>_doubleword" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))] + "!TARGET_64BIT" + "#" + "!TARGET_64BIT && reload_completed" + [(set (match_dup 0) 
(zero_extend:SI (match_dup 1))) + (set (match_dup 2) (const_int 0))] + "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);") + +(define_insn_and_split "*zextqi_doubleword" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))] + "!TARGET_64BIT" + "#" + "!TARGET_64BIT && reload_completed" + [(set (match_dup 0) (zero_extend:SI (match_dup 1))) + (set (match_dup 2) (const_int 0))] + "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);") + (define_insn "*andsi_1" [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k") (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k") @@ -8298,9 +8340,9 @@ ;; If this is considered useful, it should be done with splitters. (define_expand "<code><mode>3" - [(set (match_operand:SWIM 0 "nonimmediate_operand") - (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand") - (match_operand:SWIM 2 "<general_operand>")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "<general_operand>")))] "" "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;") @@ -8318,6 +8360,23 @@ [(set_attr "type" "alu,alu,msklog") (set_attr "mode" "<MODE>")]) +(define_insn_and_split "*<code>di3_doubleword" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r") + (any_or:DI + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && ix86_binary_operator_ok (<CODE>, DImode, operands)" + "#" + "!TARGET_64BIT && reload_completed" + [(parallel [(set (match_dup 0) + (any_or:SI (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 3) + (any_or:SI (match_dup 4) (match_dup 5))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (DImode, &operands[0], 3, &operands[0], 
&operands[3]);") + (define_insn "*<code>hi_1" [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k") (any_or:HI