Hi Jan, Here is the patch we've talked about recently - it merges the movmem and setmem expanders. As a natural side effect, this also enables vector_loop in memset expansion.
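To illustrate the effect (this just mirrors the new tests added at the end of the patch, so the flags and sizes are taken from there; the exact code of course depends on -march and alignment), a zeroing memset like

char a[2048];
void t (void)
{
  __builtin_memset (a, 0, 2048);
}

compiled with -O2 -minline-all-stringops -mstringop-strategy=vector_loop now gets a main loop of 16-byte vector stores (movdqa) instead of word-sized stores. For now this only triggers when the stored value is zero, since promoting a non-zero value into a vector register is not as cheap.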
Though in some places merging movmem and setmem isn't so efficient (the original implementations differed a lot), I think it's worth combining them - in many cases that allows us to remove duplicated code. I tried to keep the resulting code as close to the original as possible (except for enabling vector_loop in setmem). Because of that, there are some places that could IMHO be merged further, but are left as-is in this patch. The patch is bootstrapped and tested on i386/x86_64 (make check, plus stability testing on Spec2k and Spec2k6). Is it ok? Michael --- gcc/config/i386/i386.c | 1018 ++++++++------ .../gcc.target/i386/memset-vector_loop-1.c | 11 + .../gcc.target/i386/memset-vector_loop-2.c | 10 + 3 files changed, 406 insertions(+), 633 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 21fc531..9d5654f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22219,13 +22219,16 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, emit_label (out_label); } -/* Output "rep; mov" instruction. - Arguments have same meaning as for previous function */ +/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. + When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. + When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. + Other arguments have same meaning as for previous function. */ + static void -expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, +expand_movmem_or_setmem_via_rep (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, rtx orig_value, rtx count, - enum machine_mode mode) + enum machine_mode mode, bool issetmem) { rtx destexp; rtx srcexp; @@ -22233,82 +22236,65 @@ expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, HOST_WIDE_INT rounded_count; /* If the size is known, it is shorter to use rep movs. 
*/ - if (mode == QImode && CONST_INT_P (count) + if (!issetmem && mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)) mode = SImode; if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); - if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) - srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); - countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, + GET_MODE_SIZE (mode))); if (mode != QImode) { destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); destexp = gen_rtx_PLUS (Pmode, destexp, destptr); - srcexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); } else - { - destexp = gen_rtx_PLUS (Pmode, destptr, countreg); - srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); - } - if (CONST_INT_P (count)) + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) { rounded_count = (INTVAL (count) & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1)); destmem = shallow_copy_rtx (destmem); - srcmem = shallow_copy_rtx (srcmem); set_mem_size (destmem, rounded_count); - set_mem_size (srcmem, rounded_count); - } - else - { - if (MEM_SIZE_KNOWN_P (destmem)) - clear_mem_size (destmem); - if (MEM_SIZE_KNOWN_P (srcmem)) - clear_mem_size (srcmem); } - emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, - destexp, srcexp)); -} - -/* Output "rep; stos" instruction. - Arguments have same meaning as for previous function */ -static void -expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value, - rtx count, enum machine_mode mode, - rtx orig_value) -{ - rtx destexp; - rtx countreg; - HOST_WIDE_INT rounded_count; + else if (MEM_SIZE_KNOWN_P (destmem)) + clear_mem_size (destmem); - if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) - destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); - value = force_reg (mode, gen_lowpart (mode, value)); - countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); - if (mode != QImode) + if (issetmem) { - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + value = force_reg (mode, gen_lowpart (mode, value)); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); } else - destexp = gen_rtx_PLUS (Pmode, destptr, countreg); - if (orig_value == const0_rtx && CONST_INT_P (count)) { - rounded_count = (INTVAL (count) - & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1)); - destmem = shallow_copy_rtx (destmem); - set_mem_size (destmem, rounded_count); + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + if (mode != QImode) + { + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + if (CONST_INT_P (count)) + { + rounded_count = (INTVAL (count) + & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1)); + srcmem = shallow_copy_rtx (srcmem); + set_mem_size (srcmem, rounded_count); + } + else + { + if (MEM_SIZE_KNOWN_P (srcmem)) + clear_mem_size (srcmem); + } + emit_insn (gen_rep_mov (destptr, 
destmem, srcptr, srcmem, countreg, + destexp, srcexp)); } - else if (MEM_SIZE_KNOWN_P (destmem)) - clear_mem_size (destmem); - emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); } /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to @@ -22496,6 +22482,57 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem, } } +/* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM + with value PROMOTED_VAL. + DESTPTR is advanced accordingly. + Return value is the updated DESTMEM. */ +static rtx +emit_memset (rtx destmem, rtx destptr, rtx promoted_val, + HOST_WIDE_INT size_to_move) +{ + rtx dst = destmem, adjust; + enum insn_code code; + enum machine_mode move_mode; + int piece_size, i; + + /* Find the widest mode in which we could perform moves. + Start with the biggest power of 2 less than SIZE_TO_MOVE and halve + it until a move of such size is supported. */ + move_mode = GET_MODE (promoted_val); + if (size_to_move < GET_MODE_SIZE (move_mode)) + { + move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0); + promoted_val = gen_lowpart (move_mode, promoted_val); + } + piece_size = GET_MODE_SIZE (move_mode); + code = optab_handler (mov_optab, move_mode); + gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); + + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */ + gcc_assert (size_to_move % piece_size == 0); + adjust = GEN_INT (piece_size); + for (i = 0; i < size_to_move; i += piece_size) + { + if (piece_size <= GET_MODE_SIZE (word_mode)) + { + emit_insn (gen_strset (destptr, dst, promoted_val)); + continue; + } + + emit_insn (GEN_FCN (code) (dst, promoted_val)); + + emit_move_insn (destptr, + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + } + + /* Update DST rtx. */ + return dst; +} /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ static void expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, @@ -22511,61 +22548,30 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ static void -expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size) +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, + rtx count, int max_size) { rtx dest; if (CONST_INT_P (count)) { HOST_WIDE_INT countval = INTVAL (count); - int offset = 0; + HOST_WIDE_INT epilogue_size = countval % max_size; + int i; - if ((countval & 0x10) && max_size > 16) - { - if (TARGET_64BIT) - { - dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8); - emit_insn (gen_strset (destptr, dest, value)); - } - else - gcc_unreachable (); - offset += 16; - } - if ((countval & 0x08) && max_size > 8) + /* For now MAX_SIZE should be a power of 2. This assert could be + relaxed, but it'll require a bit more complicated epilogue + expansion. 
*/ + gcc_assert ((max_size & (max_size - 1)) == 0); + for (i = max_size; i >= 1; i >>= 1) { - if (TARGET_64BIT) - { - dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, value)); - } - else + if (epilogue_size & i) { - dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4); - emit_insn (gen_strset (destptr, dest, value)); + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); } - offset += 8; - } - if ((countval & 0x04) && max_size > 4) - { - dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); - offset += 4; - } - if ((countval & 0x02) && max_size > 2) - { - dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); - offset += 2; - } - if ((countval & 0x01) && max_size > 1) - { - dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset); - emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); - offset += 1; } return; } @@ -22637,13 +22643,16 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_ } } -/* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to - DESIRED_ALIGNMENT. +/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to + DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. + Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are + ignored. Return value is updated DESTMEM. */ static rtx -expand_movmem_prologue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx count, - int align, int desired_alignment) +expand_movmem_or_setmem_prologue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx vec_value, rtx count, int align, + int desired_alignment, bool issetmem) { int i; for (i = 1; i < desired_alignment; i <<= 1) @@ -22651,7 +22660,15 @@ expand_movmem_prologue (rtx destmem, rtx srcmem, if (align <= i) { rtx label = ix86_expand_aligntest (destptr, i, false); - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); + if (issetmem) + { + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); + } + else + destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); ix86_adjust_counter (count, i); emit_label (label); LABEL_NUSES (label) = 1; @@ -22661,22 +22678,28 @@ expand_movmem_prologue (rtx destmem, rtx srcmem, return destmem; } -/* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN. - ALIGN_BYTES is how many bytes need to be copied. - The function updates DST and SRC, namely, it sets proper alignment. - DST is returned via return value, SRC is updated via pointer SRCP. */ +/* This function is like the previous one, except here we know how many bytes + need to be copied. That allows us to update alignment not only of DST, which + is returned, but also of SRC, which is passed as a pointer for that + reason. 
*/ static rtx -expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg, - int desired_align, int align_bytes) +expand_constant_movmem_or_setmem_prologue (rtx dst, rtx *srcp, rtx destreg, + rtx srcreg, rtx value, rtx vec_value, + int desired_align, int align_bytes, + bool issetmem) { - rtx src = *srcp; + rtx src = NULL; rtx orig_dst = dst; - rtx orig_src = src; + rtx orig_src = NULL; int piece_size = 1; int copied_bytes = 0; - int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT); - if (src_align_bytes >= 0) - src_align_bytes = desired_align - src_align_bytes; + + if (!issetmem) + { + gcc_assert (srcp != NULL); + src = *srcp; + orig_src = src; + } for (piece_size = 1; piece_size <= desired_align && copied_bytes < align_bytes; @@ -22684,109 +22707,48 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg, { if (align_bytes & piece_size) { - dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); + if (issetmem) + { + if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) + dst = emit_memset (dst, destreg, vec_value, piece_size); + else + dst = emit_memset (dst, destreg, value, piece_size); + } + else + dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); copied_bytes += piece_size; } } - if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) set_mem_align (dst, desired_align * BITS_PER_UNIT); - if (src_align_bytes >= 0) - { - unsigned int src_align; - for (src_align = desired_align; src_align >= 2; src_align >>= 1) - { - if ((src_align_bytes & (src_align - 1)) - == (align_bytes & (src_align - 1))) - break; - } - if (src_align > (unsigned int) desired_align) - src_align = desired_align; - if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) - set_mem_align (src, src_align * BITS_PER_UNIT); - } if (MEM_SIZE_KNOWN_P (orig_dst)) set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); - if (MEM_SIZE_KNOWN_P (orig_src)) - set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); - *srcp = src; - return dst; -} -/* Set enough from DEST to align DEST known to by aligned by ALIGN to - DESIRED_ALIGNMENT. 
*/ -static void -expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count, - int align, int desired_alignment) -{ - if (align <= 1 && desired_alignment > 1) - { - rtx label = ix86_expand_aligntest (destptr, 1, false); - destmem = change_address (destmem, QImode, destptr); - emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value))); - ix86_adjust_counter (count, 1); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 2 && desired_alignment > 2) + if (!issetmem) { - rtx label = ix86_expand_aligntest (destptr, 2, false); - destmem = change_address (destmem, HImode, destptr); - emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value))); - ix86_adjust_counter (count, 2); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 4 && desired_alignment > 4) - { - rtx label = ix86_expand_aligntest (destptr, 4, false); - destmem = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value))); - ix86_adjust_counter (count, 4); - emit_label (label); - LABEL_NUSES (label) = 1; + int src_align_bytes = get_mem_align_offset (src, desired_align + * BITS_PER_UNIT); + if (src_align_bytes >= 0) + src_align_bytes = desired_align - src_align_bytes; + if (src_align_bytes >= 0) + { + unsigned int src_align; + for (src_align = desired_align; src_align >= 2; src_align >>= 1) + { + if ((src_align_bytes & (src_align - 1)) + == (align_bytes & (src_align - 1))) + break; + } + if (src_align > (unsigned int) desired_align) + src_align = desired_align; + if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) + set_mem_align (src, src_align * BITS_PER_UNIT); + } + if (MEM_SIZE_KNOWN_P (orig_src)) + set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); + *srcp = src; } - gcc_assert (desired_alignment <= 8); -} -/* Set enough from DST to align DST known to by aligned by ALIGN to - DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */ -static rtx -expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value, - int desired_align, int align_bytes) -{ - int off = 0; - rtx orig_dst = dst; - if (align_bytes & 1) - { - dst = adjust_automodify_address_nv (dst, QImode, destreg, 0); - off = 1; - emit_insn (gen_strset (destreg, dst, - gen_lowpart (QImode, value))); - } - if (align_bytes & 2) - { - dst = adjust_automodify_address_nv (dst, HImode, destreg, off); - if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT) - set_mem_align (dst, 2 * BITS_PER_UNIT); - off = 2; - emit_insn (gen_strset (destreg, dst, - gen_lowpart (HImode, value))); - } - if (align_bytes & 4) - { - dst = adjust_automodify_address_nv (dst, SImode, destreg, off); - if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT) - set_mem_align (dst, 4 * BITS_PER_UNIT); - off = 4; - emit_insn (gen_strset (destreg, dst, - gen_lowpart (SImode, value))); - } - dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off); - if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) - set_mem_align (dst, desired_align * BITS_PER_UNIT); - if (MEM_SIZE_KNOWN_P (orig_dst)) - set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); return dst; } @@ -22961,10 +22923,104 @@ decide_alignment (int align, return desired_align; } -/* Expand string move (memcpy) operation. Use i386 string operations - when profitable. expand_setmem contains similar code. The code - depends upon architecture, block size and alignment, but always has - the same overall structure: + +/* Helper function for memcpy. For QImode value 0xXY produce + 0xXYXYXYXY of wide specified by MODE. 
This is essentially + a * 0x10101010, but we can do slightly better than + synth_mult by unwinding the sequence by hand on CPUs with + slow multiply. */ +static rtx +promote_duplicated_reg (enum machine_mode mode, rtx val) +{ + enum machine_mode valmode = GET_MODE (val); + rtx tmp; + int nops = mode == DImode ? 3 : 2; + + gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); + if (val == const0_rtx) + return copy_to_mode_reg (mode, CONST0_RTX (mode)); + if (CONST_INT_P (val)) + { + HOST_WIDE_INT v = INTVAL (val) & 255; + + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } + + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); + } + else + { + rtx reg = convert_modes (mode, QImode, val, true); + + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_movsi_insv_1 (reg, reg)); + else + emit_insn (gen_movdi_insv_1 (reg, reg)); + else + { + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = + expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + } + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} + +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. */ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, + int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + +/* Expand string move (memcpy) or store (memset) operation. Use i386 string + operations when profitable. The code depends upon architecture, block size + and alignment, but always has the same overall structure: 1) Prologue guard: Conditional that jumps up to epilogues for small blocks that can be handled by epilogue alone. This is faster @@ -22974,24 +23030,24 @@ decide_alignment (int align, Optional dynamic check for size and libcall for large blocks is emitted here too, with -minline-stringops-dynamically. - 2) Prologue: copy first few bytes in order to get destination + 2) Prologue: copy/set first few bytes in order to get destination aligned to DESIRED_ALIGN. 
It is emitted only when ALIGN is less than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be - copied. We emit either a jump tree on power of two sized + copied/set. We emit either a jump tree on power of two sized blocks, or a byte loop. - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. + 3) Main body: the copying/storing loop itself, copying/storing in SIZE_NEEDED + chunks with specified algorithm. - 4) Epilogue: code copying tail of the block that is too small to be + 4) Epilogue: code copying/storing tail of the block that is too small to be handled by main body (or up to size guarded by prologue guard). */ - -bool -ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, - rtx expected_align_exp, rtx expected_size_exp) +static bool +ix86_expand_movmem_or_setmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, + rtx align_exp, rtx expected_align_exp, + rtx expected_size_exp, bool issetmem) { rtx destreg; - rtx srcreg; + rtx srcreg = NULL; rtx label = NULL; rtx tmp; rtx jump_around_label = NULL; @@ -23001,6 +23057,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, int size_needed = 0, epilogue_size_needed; int desired_align = 0, align_bytes = 0; enum stringop_alg alg; + rtx promoted_val = NULL; + rtx vec_promoted_val = NULL; + bool force_loopy_epilogue = false; int dynamic_check; bool need_zero_guard = false; bool noalign; @@ -23015,7 +23074,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, align = INTVAL (expected_align_exp); /* ALIGN is the minimum of destination and source alignment, but we care here just about destination alignment. */ - else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) + else if (!issetmem + && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) align = MEM_ALIGN (dst) / BITS_PER_UNIT; if (CONST_INT_P (count_exp)) @@ -23029,15 +23089,21 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, /* Step 0: Decide on preferred algorithm, desired alignment and size of chunks to be copied by main loop. */ - alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign); + alg = decide_alg (count, expected_size, issetmem, &dynamic_check, &noalign); if (alg == libcall) return false; gcc_assert (alg != no_stringop); + /* For now the vector version of memset is generated only for memory zeroing, + as creating the promoted vector value is very cheap in this case. */ + if (issetmem && alg == vector_loop && val_exp != const0_rtx) + alg = unrolled_loop; + if (!count) count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); destreg = copy_addr_to_reg (XEXP (dst, 0)); - srcreg = copy_addr_to_reg (XEXP (src, 0)); + if (!issetmem) + srcreg = copy_addr_to_reg (XEXP (src, 0)); unroll_factor = 1; move_mode = word_mode; @@ -23115,14 +23181,39 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, } gcc_assert (desired_align >= 1 && align >= 1); - /* Ensure that alignment prologue won't copy past end of block. */ - if (size_needed > 1 || (desired_align > 1 && desired_align > align)) - { - epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. 
*/ + if (issetmem && CONST_INT_P (val_exp)) + { + if (alg == vector_loop) + { + gcc_assert (val_exp == const0_rtx); + vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); + promoted_val = promote_duplicated_reg_to_size (val_exp, + GET_MODE_SIZE (word_mode), + desired_align, align); + } + else + { + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + } + } + /* Ensure that alignment prologue won't copy past end of block. */ + if (size_needed > 1 || (desired_align > 1 && desired_align > align)) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. Make sure it is power of 2. */ epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This means that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use the byte + loop variant. */ + if (issetmem && epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; if (count) { if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed) @@ -23152,7 +23243,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, used. */ if (dynamic_check != -1) { - if (CONST_INT_P (count_exp)) + if (!issetmem && CONST_INT_P (count_exp)) { if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) { @@ -23168,13 +23259,20 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), LEU, 0, GET_MODE (count_exp), 1, hot_label); predict_jump (REG_BR_PROB_BASE * 90 / 100); - emit_block_move_via_libcall (dst, src, count_exp, false); + if (issetmem) + set_storage_via_libcall (dst, count_exp, val_exp, false); + else + emit_block_move_via_libcall (dst, src, count_exp, false); emit_jump (jump_around_label); emit_label (hot_label); } } /* Step 2: Alignment prologue. */ + /* Do the expensive promotion once we branched off the small blocks. */ + if (issetmem && !promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); if (desired_align > align) { @@ -23184,17 +23282,26 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, constant offset in aliasing info. It don't seems to worth the pain to maintain it for the first move, so throw away the info early. */ - src = change_address (src, BLKmode, srcreg); dst = change_address (dst, BLKmode, destreg); - dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align, - desired_align); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = expand_movmem_or_setmem_prologue (dst, src, destreg, srcreg, + promoted_val, vec_promoted_val, + count_exp, align, desired_align, + issetmem); } else { /* If we know how many bytes need to be stored before dst is sufficiently aligned, maintain aliasing info accurately. 
*/ - dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg, - desired_align, align_bytes); + dst = expand_constant_movmem_or_setmem_prologue (dst, &src, destreg, + srcreg, + promoted_val, + vec_promoted_val, + desired_align, + align_bytes, + issetmem); + count_exp = plus_constant (counter_mode (count_exp), count_exp, -align_bytes); count -= align_bytes; @@ -23226,6 +23333,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, LABEL_NUSES (label) = 1; label = NULL; epilogue_size_needed = 1; + if (issetmem) + promoted_val = val_exp; } else if (label == NULL_RTX) epilogue_size_needed = size_needed; @@ -23241,29 +23350,35 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, case loop_1_byte: case loop: case unrolled_loop: - case vector_loop: - expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val, count_exp, move_mode, unroll_factor, expected_size); break; + case vector_loop: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, + vec_promoted_val, count_exp, move_mode, + unroll_factor, expected_size); + break; case rep_prefix_8_byte: case rep_prefix_4_byte: case rep_prefix_1_byte: - expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, - move_mode); + expand_movmem_or_setmem_via_rep (dst, src, destreg, srcreg, promoted_val, + val_exp, count_exp, move_mode, issetmem); break; } /* Adjust properly the offset of src and dest memory for aliasing. */ if (CONST_INT_P (count_exp)) { - src = adjust_automodify_address_nv (src, BLKmode, srcreg, - (count / size_needed) * size_needed); + if (!issetmem) + src = adjust_automodify_address_nv (src, BLKmode, srcreg, + (count / size_needed) * size_needed); dst = adjust_automodify_address_nv (dst, BLKmode, destreg, (count / size_needed) * size_needed); } else { - src = change_address (src, BLKmode, srcreg); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); dst = change_address (dst, BLKmode, destreg); } @@ -23272,7 +23387,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, if (label) { /* When the main loop is done, COUNT_EXP might hold original count, - while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. Compensate if needed. */ @@ -23290,408 +23405,45 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, } if (count_exp != const0_rtx && epilogue_size_needed > 1) - expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, - epilogue_size_needed); - if (jump_around_label) - emit_label (jump_around_label); - return true; -} - -/* Helper function for memcpy. For QImode value 0xXY produce - 0xXYXYXYXY of wide specified by MODE. This is essentially - a * 0x10101010, but we can do slightly better than - synth_mult by unwinding the sequence by hand on CPUs with - slow multiply. */ -static rtx -promote_duplicated_reg (enum machine_mode mode, rtx val) -{ - enum machine_mode valmode = GET_MODE (val); - rtx tmp; - int nops = mode == DImode ? 
3 : 2; - - gcc_assert (mode == SImode || mode == DImode); - if (val == const0_rtx) - return copy_to_mode_reg (mode, const0_rtx); - if (CONST_INT_P (val)) { - HOST_WIDE_INT v = INTVAL (val) & 255; - - v |= v << 8; - v |= v << 16; - if (mode == DImode) - v |= (v << 16) << 16; - return copy_to_mode_reg (mode, gen_int_mode (v, mode)); - } - - if (valmode == VOIDmode) - valmode = QImode; - if (valmode != QImode) - val = gen_lowpart (QImode, val); - if (mode == QImode) - return val; - if (!TARGET_PARTIAL_REG_STALL) - nops--; - if (ix86_cost->mult_init[mode == DImode ? 3 : 2] - + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) - <= (ix86_cost->shift_const + ix86_cost->add) * nops - + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) - { - rtx reg = convert_modes (mode, QImode, val, true); - tmp = promote_duplicated_reg (mode, const1_rtx); - return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, - OPTAB_DIRECT); - } - else - { - rtx reg = convert_modes (mode, QImode, val, true); - - if (!TARGET_PARTIAL_REG_STALL) - if (mode == SImode) - emit_insn (gen_movsi_insv_1 (reg, reg)); - else - emit_insn (gen_movdi_insv_1 (reg, reg)); + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + epilogue_size_needed); else { - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), - NULL, 1, OPTAB_DIRECT); - reg = - expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (issetmem) + expand_setmem_epilogue (dst, destreg, promoted_val, + vec_promoted_val, count_exp, + epilogue_size_needed); + else + expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, + epilogue_size_needed); } - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - if (mode == SImode) - return reg; - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - return reg; } + if (jump_around_label) + emit_label (jump_around_label); + return true; } -/* Duplicate value VAL using promote_duplicated_reg into maximal size that will - be needed by main loop copying SIZE_NEEDED chunks and prologue getting - alignment from ALIGN to DESIRED_ALIGN. */ -static rtx -promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align) +/* Wrapper for ix86_expand_movmem_or_setmem for memcpy case. */ +bool +ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) { - rtx promoted_val; - - if (TARGET_64BIT - && (size_needed > 4 || (desired_align > align && desired_align > 4))) - promoted_val = promote_duplicated_reg (DImode, val); - else if (size_needed > 2 || (desired_align > align && desired_align > 2)) - promoted_val = promote_duplicated_reg (SImode, val); - else if (size_needed > 1 || (desired_align > align && desired_align > 1)) - promoted_val = promote_duplicated_reg (HImode, val); - else - promoted_val = val; - - return promoted_val; + return ix86_expand_movmem_or_setmem (dst, src, count_exp, NULL, align_exp, + expected_align_exp, expected_size_exp, false); } -/* Expand string clear operation (bzero). Use i386 string operations when - profitable. See expand_movmem comment for explanation of individual - steps performed. */ +/* Wrapper for ix86_expand_movmem_or_setmem for memset case. 
*/ bool ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, rtx expected_align_exp, rtx expected_size_exp) { - rtx destreg; - rtx label = NULL; - rtx tmp; - rtx jump_around_label = NULL; - HOST_WIDE_INT align = 1; - unsigned HOST_WIDE_INT count = 0; - HOST_WIDE_INT expected_size = -1; - int size_needed = 0, epilogue_size_needed; - int desired_align = 0, align_bytes = 0; - enum stringop_alg alg; - rtx promoted_val = NULL; - bool force_loopy_epilogue = false; - int dynamic_check; - bool need_zero_guard = false; - bool noalign; - enum machine_mode move_mode = VOIDmode; - int unroll_factor; - - if (CONST_INT_P (align_exp)) - align = INTVAL (align_exp); - /* i386 can do misaligned access on reasonably increased cost. */ - if (CONST_INT_P (expected_align_exp) - && INTVAL (expected_align_exp) > align) - align = INTVAL (expected_align_exp); - if (CONST_INT_P (count_exp)) - count = expected_size = INTVAL (count_exp); - if (CONST_INT_P (expected_size_exp) && count == 0) - expected_size = INTVAL (expected_size_exp); - - /* Make sure we don't need to care about overflow later on. */ - if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) - return false; - - /* Step 0: Decide on preferred algorithm, desired alignment and - size of chunks to be copied by main loop. */ - - alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign); - if (alg == libcall) - return false; - gcc_assert (alg != no_stringop); - - if (!count) - count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp); - destreg = copy_addr_to_reg (XEXP (dst, 0)); - - move_mode = word_mode; - unroll_factor = 1; - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop: - need_zero_guard = true; - break; - case vector_loop: - case unrolled_loop: - need_zero_guard = true; - unroll_factor = 4; - break; - case rep_prefix_8_byte: - move_mode = DImode; - break; - case rep_prefix_4_byte: - move_mode = SImode; - break; - case rep_prefix_1_byte: - move_mode = QImode; - break; - case loop_1_byte: - need_zero_guard = true; - move_mode = QImode; - break; - } - size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; - epilogue_size_needed = size_needed; - - desired_align = decide_alignment (align, alg, expected_size, move_mode); - if (!TARGET_ALIGN_STRINGOPS || noalign) - align = desired_align; - - /* Step 1: Prologue guard. */ - - /* Alignment code needs count to be in register. */ - if (CONST_INT_P (count_exp) && desired_align > align) - { - if (INTVAL (count_exp) > desired_align - && INTVAL (count_exp) > size_needed) - { - align_bytes - = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); - if (align_bytes <= 0) - align_bytes = 0; - else - align_bytes = desired_align - align_bytes; - } - if (align_bytes == 0) - { - enum machine_mode mode = SImode; - if (TARGET_64BIT && (count & ~0xffffffff)) - mode = DImode; - count_exp = force_reg (mode, count_exp); - } - } - /* Do the cheap promotion to allow better CSE across the - main loop and epilogue (ie one load of the big constant in the - front of all code. */ - if (CONST_INT_P (val_exp)) - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - /* Ensure that alignment prologue won't copy past end of block. */ - if (size_needed > 1 || (desired_align > 1 && desired_align > align)) - { - epilogue_size_needed = MAX (size_needed - 1, desired_align - align); - /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes. - Make sure it is power of 2. 
*/ - epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); - - /* To improve performance of small blocks, we jump around the VAL - promoting mode. This mean that if the promoted VAL is not constant, - we might not use it in the epilogue and have to use byte - loop variant. */ - if (epilogue_size_needed > 2 && !promoted_val) - force_loopy_epilogue = true; - if (count) - { - if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed) - { - /* If main algorithm works on QImode, no epilogue is needed. - For small sizes just don't align anything. */ - if (size_needed == 1) - desired_align = align; - else - goto epilogue; - } - } - else - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 || expected_size <= epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); - else - predict_jump (REG_BR_PROB_BASE * 20 / 100); - } - } - if (dynamic_check != -1) - { - rtx hot_label = gen_label_rtx (); - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, counter_mode (count_exp), 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - set_storage_via_libcall (dst, count_exp, val_exp, false); - emit_jump (jump_around_label); - emit_label (hot_label); - } - - /* Step 2: Alignment prologue. */ - - /* Do the expensive promotion once we branched off the small blocks. */ - if (!promoted_val) - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - gcc_assert (desired_align >= 1 && align >= 1); - - if (desired_align > align) - { - if (align_bytes == 0) - { - /* Except for the first move in epilogue, we no longer know - constant offset in aliasing info. It don't seems to worth - the pain to maintain it for the first move, so throw away - the info early. */ - dst = change_address (dst, BLKmode, destreg); - expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align, - desired_align); - } - else - { - /* If we know how many bytes need to be stored before dst is - sufficiently aligned, maintain aliasing info accurately. */ - dst = expand_constant_setmem_prologue (dst, destreg, promoted_val, - desired_align, align_bytes); - count_exp = plus_constant (counter_mode (count_exp), - count_exp, -align_bytes); - count -= align_bytes; - } - if (need_zero_guard - && (count < (unsigned HOST_WIDE_INT) size_needed - || (align_bytes == 0 - && count < ((unsigned HOST_WIDE_INT) size_needed - + desired_align - align)))) - { - /* It is possible that we copied enough so the main loop will not - execute. */ - gcc_assert (size_needed > 1); - if (label == NULL_RTX) - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - if (label && size_needed == 1) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL; - promoted_val = val_exp; - epilogue_size_needed = 1; - } - else if (label == NULL_RTX) - epilogue_size_needed = size_needed; - - /* Step 3: Main loop. 
*/ - - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - case loop: - case vector_loop: - case unrolled_loop: - expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, - count_exp, move_mode, unroll_factor, - expected_size); - break; - case rep_prefix_8_byte: - expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, - DImode, val_exp); - break; - case rep_prefix_4_byte: - expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, - SImode, val_exp); - break; - case rep_prefix_1_byte: - expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, - QImode, val_exp); - break; - } - /* Adjust properly the offset of src and dest memory for aliasing. */ - if (CONST_INT_P (count_exp)) - dst = adjust_automodify_address_nv (dst, BLKmode, destreg, - (count / size_needed) * size_needed); - else - dst = change_address (dst, BLKmode, destreg); - - /* Step 4: Epilogue to copy the remaining bytes. */ - - if (label) - { - /* When the main loop is done, COUNT_EXP might hold original count, - while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. - Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED - bytes. Compensate if needed. */ - - if (size_needed < epilogue_size_needed) - { - tmp = - expand_simple_binop (counter_mode (count_exp), AND, count_exp, - GEN_INT (size_needed - 1), count_exp, 1, - OPTAB_DIRECT); - if (tmp != count_exp) - emit_move_insn (count_exp, tmp); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - epilogue: - if (count_exp != const0_rtx && epilogue_size_needed > 1) - { - if (force_loopy_epilogue) - expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, - epilogue_size_needed); - else - expand_setmem_epilogue (dst, destreg, promoted_val, count_exp, - epilogue_size_needed); - } - if (jump_around_label) - emit_label (jump_around_label); - return true; + return ix86_expand_movmem_or_setmem (dst, NULL, count_exp, val_exp, align_exp, + expected_align_exp, expected_size_exp, true); } + /* Expand the appropriate insns for doing strlen if not just doing repnz; scasb diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c new file mode 100644 index 0000000..ad0d130 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movdqa" 4 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 0, 2048); +} + + diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c new file mode 100644 index 0000000..f2ceb44 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movdqa" 4} } */ + +char *a; +void t (void) +{ + __builtin_memset (a, 0, 2048); +} + -- 1.7.11.7