Hello! The attached patch prevents unwanted peephole2 matching. With SSE, we can expand an atomic load through XMM registers using generic DImode move patterns. Because these moves are generic, it can happen that a peephole2 pattern matches an unrelated insn sequence and, as a result, removes an unrelated FP store to memory.
The patch introduces specialized SSE move patterns for this purpose, so that the peephole2 patterns match only the insns emitted by the atomic expanders.

2017-05-11  Uros Bizjak  <ubiz...@gmail.com>

    PR target/80706
    * config/i386/sync.md (UNSPEC_LDX_ATOMIC): New unspec.
    (UNSPEC_STX_ATOMIC): Ditto.
    (loaddi_via_sse): New insn.
    (storedi_via_sse): Ditto.
    (atomic_loaddi_fpu): Emit loaddi_via_sse and storedi_via_sse.
    Update corresponding peephole2 patterns.
    (atomic_storedi_fpu): Ditto.

testsuite/ChangeLog:

2017-05-11  Uros Bizjak  <ubiz...@gmail.com>
            Jakub Jelinek  <ja...@redhat.com>

    PR target/80706
    * gcc.target/i386/pr80706.c: New test.

The patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. It has been committed to mainline and will be backported to the gcc-7 branch.

Uros.
Index: config/i386/sync.md =================================================================== --- config/i386/sync.md (revision 247914) +++ config/i386/sync.md (working copy) @@ -25,6 +25,9 @@ UNSPEC_FILD_ATOMIC UNSPEC_FIST_ATOMIC + UNSPEC_LDX_ATOMIC + UNSPEC_STX_ATOMIC + ;; __atomic support UNSPEC_LDA UNSPEC_STA @@ -199,9 +202,8 @@ } else { - adjust_reg_mode (tmp, DImode); - emit_move_insn (tmp, src); - emit_move_insn (mem, tmp); + emit_insn (gen_loaddi_via_sse (tmp, src)); + emit_insn (gen_storedi_via_sse (mem, tmp)); } if (mem != dst) @@ -226,10 +228,12 @@ "operands[5] = gen_lowpart (DFmode, operands[1]);") (define_peephole2 - [(set (match_operand:DI 0 "sse_reg_operand") - (match_operand:DI 1 "memory_operand")) + [(set (match_operand:DF 0 "sse_reg_operand") + (unspec:DF [(match_operand:DI 1 "memory_operand")] + UNSPEC_LDX_ATOMIC)) (set (match_operand:DI 2 "memory_operand") - (match_dup 0)) + (unspec:DI [(match_dup 0)] + UNSPEC_STX_ATOMIC)) (set (match_operand:DF 3 "fp_register_operand") (match_operand:DF 4 "memory_operand"))] "!TARGET_64BIT @@ -301,7 +305,9 @@ rtx dst = operands[0], src = operands[1]; rtx mem = operands[2], tmp = operands[3]; - if (!SSE_REG_P (src)) + if (SSE_REG_P (src)) + emit_move_insn (dst, src); + else { if (REG_P (src)) { @@ -313,16 +319,13 @@ { emit_insn (gen_loaddi_via_fpu (tmp, src)); emit_insn (gen_storedi_via_fpu (dst, tmp)); - DONE; } else { - adjust_reg_mode (tmp, DImode); - emit_move_insn (tmp, src); - src = tmp; + emit_insn (gen_loaddi_via_sse (tmp, src)); + emit_insn (gen_storedi_via_sse (dst, tmp)); } } - emit_move_insn (dst, src); DONE; }) @@ -344,10 +347,12 @@ (define_peephole2 [(set (match_operand:DF 0 "memory_operand") (match_operand:DF 1 "fp_register_operand")) - (set (match_operand:DI 2 "sse_reg_operand") - (match_operand:DI 3 "memory_operand")) + (set (match_operand:DF 2 "sse_reg_operand") + (unspec:DF [(match_operand:DI 3 "memory_operand")] + UNSPEC_LDX_ATOMIC)) (set (match_operand:DI 4 "memory_operand") - (match_dup 
2))] + (unspec:DI [(match_dup 2)] + UNSPEC_STX_ATOMIC))] "!TARGET_64BIT && peep2_reg_dead_p (3, operands[2]) && rtx_equal_p (operands[0], adjust_address_nv (operands[3], DFmode, 0))" @@ -382,6 +387,32 @@ [(set_attr "type" "fmov") (set_attr "mode" "DI")]) +(define_insn "loaddi_via_sse" + [(set (match_operand:DF 0 "register_operand" "=x") + (unspec:DF [(match_operand:DI 1 "memory_operand" "m")] + UNSPEC_LDX_ATOMIC))] + "TARGET_SSE" +{ + if (TARGET_SSE2) + return "%vmovq\t{%1, %0|%0, %1}"; + return "movlps\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "ssemov") + (set_attr "mode" "DI")]) + +(define_insn "storedi_via_sse" + [(set (match_operand:DI 0 "memory_operand" "=m") + (unspec:DI [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_STX_ATOMIC))] + "TARGET_SSE" +{ + if (TARGET_SSE2) + return "%vmovq\t{%1, %0|%0, %1}"; + return "movlps\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "ssemov") + (set_attr "mode" "DI")]) + (define_expand "atomic_compare_and_swap<mode>" [(match_operand:QI 0 "register_operand") ;; bool success output (match_operand:SWI124 1 "register_operand") ;; oldval output Index: testsuite/gcc.target/i386/pr80706.c =================================================================== --- testsuite/gcc.target/i386/pr80706.c (nonexistent) +++ testsuite/gcc.target/i386/pr80706.c (working copy) @@ -0,0 +1,30 @@ +/* PR target/80706 */ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +union U { double value; struct S { int lsw; int msw; } parts; }; + +__attribute__((noinline, noclone)) double +foo (void) +{ + __asm volatile ("" : : : "memory"); + return 2.0; +} + +__attribute__((noinline, noclone)) double +bar (void) +{ + double s = foo (); + union U z; + z.value = s; + z.parts.lsw = 0; + return z.value * z.value + s * s; +} + +int +main () +{ + if (bar () != 8.0) + __builtin_abort (); + return 0; +}