On Mon, Oct 15, 2012 at 6:39 PM, Ulrich Weigand <uweig...@de.ibm.com> wrote:

>> > I was wondering if the i386 port maintainers could have a look at this
>> > pattern.  Shouldn't we really have two patterns, one to *load* an unaligned
>> > value and one to *store* and unaligned value, and not permit that memory
>> > access to get reloaded?
>>
>> Please find attached a fairly mechanical patch that splits
>> move_unaligned pattern into load_unaligned and store_unaligned
>> patterns. We've had some problems with this pattern, and finally we
>> found the reason to make unaligned moves more robust.
>>
>> I will wait for the confirmation that attached patch avoids the
>> failure you are seeing with your reload patch.
>
> Yes, this patch does in fact fix the failure I was seeing with the
> reload patch.  (A full regression test shows a couple of extra fails:
> FAIL: gcc.target/i386/avx256-unaligned-load-1.c scan-assembler sse_movups/1
> FAIL: gcc.target/i386/avx256-unaligned-load-3.c scan-assembler sse2_movupd/1
> FAIL: gcc.target/i386/avx256-unaligned-load-4.c scan-assembler avx_movups256/1
> FAIL: gcc.target/i386/avx256-unaligned-store-4.c scan-assembler 
> avx_movups256/2
> But I guess these tests simply need to be updated for the new pattern names.)

2012-10-15  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/sse.md (UNSPEC_MOVU): Remove.
        (UNSPEC_LOADU): New.
        (UNSPEC_STOREU): Ditto.
        (<sse>_movu<ssemodesuffix><avxsizesuffix>): Split to ...
        (<sse>_loadu<ssemodesuffix><avxsizesuffix>): ... this and ...
        (<sse>_storeu<ssemodesuffix><avxsizesuffix>) ... this.
        (<sse2>_movdqu<avxsizesuffix>): Split to ...
        (<sse2>_loaddqu<avxsizesuffix>): ... this and ...
        (<sse2>_storedqu<avxsizesuffix>): ... this.
        (*sse4_2_pcmpestr_unaligned): Update.
        (*sse4_2_pcmpistr_unaligned): Ditto.

        * config/i386/i386.c (ix86_avx256_split_vector_move_misalign): Use
        gen_avx_load{dqu,ups,upd}256 to load from unaligned memory and
        gen_avx_store{dqu,ups,upd}256 to store to unaligned memory.
        (ix86_expand_vector_move_misalign): Use gen_sse_loadups or
        gen_sse2_load{dqu,upd} to load from unaligned memory and
        gen_sse_loadups or gen_sse2_store{dqu,upd}256 to store to
        unaligned memory.
        (struct builtin_description bdesc_spec) <IX86_BUILTIN_LOADUPS>:
        Use CODE_FOR_sse_loadups.
        <IX86_BUILTIN_LOADUPD>: Use CODE_FOR_sse2_loadupd.
        <IX86_BUILTIN_LOADDQU>: Use CODE_FOR_sse2_loaddqu.
        <IX86_BUILTIN_STOREUPS>: Use CODE_FOR_sse_storeups.
        <IX86_BUILTIN_STOREUPD>: Use CODE_FOR_sse2_storeupd.
        <IX86_BUILTIN_STOREDQU>: Use CODE_FOR_sse2_storedqu.
        <IX86_BUILTIN_LOADUPS256>: Use CODE_FOR_avx_loadups256.
        <IX86_BUILTIN_LOADUPD256>: Use CODE_FOR_avx_loadupd256.
        <IX86_BUILTIN_LOADDQU256>: Use CODE_FOR_avx_loaddqu256.
        <IX86_BUILTIN_STOREUPS256>: Use CODE_FOR_avx_storeups256.
        <IX86_BUILTIN_STOREUPD256>: Use CODE_FOR_avx_storeupd256.
        <IX86_BUILTIN_STOREDQU256>: Use CODE_FOR_avx_storedqu256.

testsuite/ChangeLog:

2012-10-15  Uros Bizjak  <ubiz...@gmail.com>

        * gcc.target/i386/avx256-unaligned-load-1.c: Update asm scan patterns.
        * gcc.target/i386/avx256-unaligned-load-2.c: Ditto.
        * gcc.target/i386/avx256-unaligned-load-3.c: Ditto.
        * gcc.target/i386/avx256-unaligned-load-4.c: Ditto.
        * gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
        * gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
        * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
        * gcc.target/i386/avx256-unaligned-store-4.c: Ditto.

Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}.

Committed to mainline, similar patch will be committed to 4.7 branch.

Uros.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 192420)
+++ config/i386/i386.c  (working copy)
@@ -16059,7 +16059,8 @@ ix86_avx256_split_vector_move_misalign (rtx op0, r
 {
   rtx m;
   rtx (*extract) (rtx, rtx, rtx);
-  rtx (*move_unaligned) (rtx, rtx);
+  rtx (*load_unaligned) (rtx, rtx);
+  rtx (*store_unaligned) (rtx, rtx);
   enum machine_mode mode;
 
   switch (GET_MODE (op0))
@@ -16068,39 +16069,52 @@ ix86_avx256_split_vector_move_misalign (rtx op0, r
       gcc_unreachable ();
     case V32QImode:
       extract = gen_avx_vextractf128v32qi;
-      move_unaligned = gen_avx_movdqu256;
+      load_unaligned = gen_avx_loaddqu256;
+      store_unaligned = gen_avx_storedqu256;
       mode = V16QImode;
       break;
     case V8SFmode:
       extract = gen_avx_vextractf128v8sf;
-      move_unaligned = gen_avx_movups256;
+      load_unaligned = gen_avx_loadups256;
+      store_unaligned = gen_avx_storeups256;
       mode = V4SFmode;
       break;
     case V4DFmode:
       extract = gen_avx_vextractf128v4df;
-      move_unaligned = gen_avx_movupd256;
+      load_unaligned = gen_avx_loadupd256;
+      store_unaligned = gen_avx_storeupd256;
       mode = V2DFmode;
       break;
     }
 
-  if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+  if (MEM_P (op1))
     {
-      rtx r = gen_reg_rtx (mode);
-      m = adjust_address (op1, mode, 0);
-      emit_move_insn (r, m);
-      m = adjust_address (op1, mode, 16);
-      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
-      emit_move_insn (op0, r);
+      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+       {
+         rtx r = gen_reg_rtx (mode);
+         m = adjust_address (op1, mode, 0);
+         emit_move_insn (r, m);
+         m = adjust_address (op1, mode, 16);
+         r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+         emit_move_insn (op0, r);
+       }
+      else
+       emit_insn (load_unaligned (op0, op1));
     }
-  else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+  else if (MEM_P (op0))
     {
-      m = adjust_address (op0, mode, 0);
-      emit_insn (extract (m, op1, const0_rtx));
-      m = adjust_address (op0, mode, 16);
-      emit_insn (extract (m, op1, const1_rtx));
+      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+       {
+         m = adjust_address (op0, mode, 0);
+         emit_insn (extract (m, op1, const0_rtx));
+         m = adjust_address (op0, mode, 16);
+         emit_insn (extract (m, op1, const1_rtx));
+       }
+      else
+       emit_insn (store_unaligned (op0, op1));
     }
   else
-    emit_insn (move_unaligned (op0, op1));
+    gcc_unreachable ();
 }
 
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
@@ -16195,7 +16209,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
          op0 = gen_lowpart (V16QImode, op0);
          op1 = gen_lowpart (V16QImode, op1);
          /* We will eventually emit movups based on insn attributes.  */
-         emit_insn (gen_sse2_movdqu (op0, op1));
+         emit_insn (gen_sse2_loaddqu (op0, op1));
        }
       else if (TARGET_SSE2 && mode == V2DFmode)
         {
@@ -16207,7 +16221,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
              || optimize_function_for_size_p (cfun))
            {
              /* We will eventually emit movups based on insn attributes.  */
-             emit_insn (gen_sse2_movupd (op0, op1));
+             emit_insn (gen_sse2_loadupd (op0, op1));
              return;
            }
 
@@ -16245,7 +16259,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
            {
              op0 = gen_lowpart (V4SFmode, op0);
              op1 = gen_lowpart (V4SFmode, op1);
-             emit_insn (gen_sse_movups (op0, op1));
+             emit_insn (gen_sse_loadups (op0, op1));
              return;
             }
 
@@ -16270,7 +16284,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
          op0 = gen_lowpart (V16QImode, op0);
          op1 = gen_lowpart (V16QImode, op1);
          /* We will eventually emit movups based on insn attributes.  */
-         emit_insn (gen_sse2_movdqu (op0, op1));
+         emit_insn (gen_sse2_storedqu (op0, op1));
        }
       else if (TARGET_SSE2 && mode == V2DFmode)
        {
@@ -16279,7 +16293,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
              || optimize_function_for_size_p (cfun))
            /* We will eventually emit movups based on insn attributes.  */
-           emit_insn (gen_sse2_movupd (op0, op1));
+           emit_insn (gen_sse2_storeupd (op0, op1));
          else
            {
              m = adjust_address (op0, DFmode, 0);
@@ -16299,7 +16313,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
              || optimize_function_for_size_p (cfun))
            {
              op0 = gen_lowpart (V4SFmode, op0);
-             emit_insn (gen_sse_movups (op0, op1));
+             emit_insn (gen_sse_storeups (op0, op1));
            }
          else
            {
@@ -26765,9 +26779,9 @@ static const struct builtin_description bdesc_spec
   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", 
IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
 
   /* SSE */
-  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", 
IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", 
IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", 
IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
-  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", 
IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", 
IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
 
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", 
IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", 
IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
@@ -26781,14 +26795,14 @@ static const struct builtin_description bdesc_spec
   /* SSE2 */
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", 
IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, 
UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", 
IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", 
IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", 
IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", 
IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", 
IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", 
IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", 
IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, 
"__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) 
VOID_FTYPE_PLONGLONG_LONGLONG },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", 
IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", 
IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", 
IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", 
IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", 
IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", 
IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
@@ -26813,12 +26827,12 @@ static const struct builtin_description bdesc_spec
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, 
"__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, 
(int) V4DF_FTYPE_PCV2DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, 
"__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, 
(int) V8SF_FTYPE_PCV4SF },
 
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", 
IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", 
IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", 
IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", 
IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", 
IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", 
IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", 
IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", 
IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, 
"__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) 
VOID_FTYPE_PDOUBLE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, 
"__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) 
VOID_FTYPE_PFLOAT_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", 
IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, 
"__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) 
VOID_FTYPE_PCHAR_V32QI },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", 
IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", 
IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md  (revision 192420)
+++ config/i386/sse.md  (working copy)
@@ -21,7 +21,8 @@
 (define_c_enum "unspec" [
   ;; SSE
   UNSPEC_MOVNT
-  UNSPEC_MOVU
+  UNSPEC_LOADU
+  UNSPEC_STOREU
 
   ;; SSE3
   UNSPEC_LDDQU
@@ -586,12 +587,12 @@
   DONE;
 })
 
-(define_insn "<sse>_movu<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m")
+(define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "register_operand" "=x")
        (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
-         UNSPEC_MOVU))]
-  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+         [(match_operand:VF 1 "memory_operand" "m")]
+         UNSPEC_LOADU))]
+  "TARGET_SSE"
 {
   switch (get_attr_mode (insn))
     {
@@ -618,11 +619,12 @@
              ]
              (const_string "<MODE>")))])
 
-(define_insn "<sse2>_movdqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "nonimmediate_operand" "=x,m")
-       (unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
-                   UNSPEC_MOVU))]
-  "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+(define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "memory_operand" "=m")
+       (unspec:VF
+         [(match_operand:VF 1 "register_operand" "x")]
+         UNSPEC_STOREU))]
+  "TARGET_SSE"
 {
   switch (get_attr_mode (insn))
     {
@@ -630,6 +632,37 @@
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
     default:
+      return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "<ssePSmode>")
+              (and (eq_attr "alternative" "1")
+                   (match_test "TARGET_SSE_TYPELESS_STORES"))
+                (const_string "<ssePSmode>")
+              (match_test "TARGET_AVX")
+                (const_string "<MODE>")
+              (match_test "optimize_function_for_size_p (cfun)")
+                (const_string "V4SF")
+             ]
+             (const_string "<MODE>")))])
+
+(define_insn "<sse2>_loaddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand" "=x")
+       (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
+                   UNSPEC_LOADU))]
+  "TARGET_SSE2"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
       return "%vmovdqu\t{%1, %0|%0, %1}";
     }
 }
@@ -654,6 +687,42 @@
              ]
              (const_string "<sseinsnmode>")))])
 
+(define_insn "<sse2>_storedqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "memory_operand" "=m")
+       (unspec:VI1 [(match_operand:VI1 1 "register_operand" "x")]
+                   UNSPEC_STOREU))]
+  "TARGET_SSE2"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovdqu\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "<ssePSmode>")
+              (and (eq_attr "alternative" "1")
+                   (match_test "TARGET_SSE_TYPELESS_STORES"))
+                (const_string "<ssePSmode>")
+              (match_test "TARGET_AVX")
+                (const_string "<sseinsnmode>")
+              (match_test "optimize_function_for_size_p (cfun)")
+                (const_string "V4SF")
+             ]
+             (const_string "<sseinsnmode>")))])
+
 (define_insn "<sse3>_lddqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "register_operand" "=x")
        (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
@@ -9307,7 +9376,7 @@
           (match_operand:SI 3 "register_operand" "a")
           (unspec:V16QI
             [(match_operand:V16QI 4 "memory_operand" "m")]
-            UNSPEC_MOVU)
+            UNSPEC_LOADU)
           (match_operand:SI 5 "register_operand" "d")
           (match_operand:SI 6 "const_0_to_255_operand" "n")]
          UNSPEC_PCMPESTR))
@@ -9315,7 +9384,7 @@
        (unspec:V16QI
          [(match_dup 2)
           (match_dup 3)
-          (unspec:V16QI [(match_dup 4)] UNSPEC_MOVU)
+          (unspec:V16QI [(match_dup 4)] UNSPEC_LOADU)
           (match_dup 5)
           (match_dup 6)]
          UNSPEC_PCMPESTR))
@@ -9323,7 +9392,7 @@
        (unspec:CC
          [(match_dup 2)
           (match_dup 3)
-          (unspec:V16QI [(match_dup 4)] UNSPEC_MOVU)
+          (unspec:V16QI [(match_dup 4)] UNSPEC_LOADU)
           (match_dup 5)
           (match_dup 6)]
          UNSPEC_PCMPESTR))]
@@ -9498,19 +9567,19 @@
          [(match_operand:V16QI 2 "register_operand" "x")
           (unspec:V16QI
             [(match_operand:V16QI 3 "memory_operand" "m")]
-            UNSPEC_MOVU)
+            UNSPEC_LOADU)
           (match_operand:SI 4 "const_0_to_255_operand" "n")]
          UNSPEC_PCMPISTR))
    (set (match_operand:V16QI 1 "register_operand" "=Yz")
        (unspec:V16QI
          [(match_dup 2)
-          (unspec:V16QI [(match_dup 3)] UNSPEC_MOVU)
+          (unspec:V16QI [(match_dup 3)] UNSPEC_LOADU)
           (match_dup 4)]
          UNSPEC_PCMPISTR))
    (set (reg:CC FLAGS_REG)
        (unspec:CC
          [(match_dup 2)
-          (unspec:V16QI [(match_dup 3)] UNSPEC_MOVU)
+          (unspec:V16QI [(match_dup 3)] UNSPEC_LOADU)
           (match_dup 4)]
          UNSPEC_PCMPISTR))]
   "TARGET_SSE4_2
Index: testsuite/gcc.target/i386/avx256-unaligned-load-1.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-load-1.c (revision 192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-load-1.c (working copy)
@@ -14,6 +14,6 @@ avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_movups256/1" } } */
-/* { dg-final { scan-assembler "sse_movups/1" } } */
+/* { dg-final { scan-assembler-not "avx_loadups256" } } */
+/* { dg-final { scan-assembler "sse_loadups" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-load-2.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-load-2.c (revision 192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-load-2.c (working copy)
@@ -24,6 +24,6 @@ avx_test (void)
     }
 }
 
-/* { dg-final { scan-assembler-not "avx_movdqu256/1" } } */
-/* { dg-final { scan-assembler "sse2_movdqu/1" } } */
+/* { dg-final { scan-assembler-not "avx_loaddqu256" } } */
+/* { dg-final { scan-assembler "sse2_loaddqu" } } */
 /* { dg-final { scan-assembler "vinsert.128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-load-3.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-load-3.c (revision 192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-load-3.c (working copy)
@@ -14,6 +14,6 @@ avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_movupd256/1" } } */
-/* { dg-final { scan-assembler "sse2_movupd/1" } } */
+/* { dg-final { scan-assembler-not "avx_loadupd256" } } */
+/* { dg-final { scan-assembler "sse2_loadupd" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-load-4.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-load-4.c (revision 192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-load-4.c (working copy)
@@ -14,6 +14,6 @@ avx_test (void)
     b[i] = a[i+3] * 2;
 }
 
-/* { dg-final { scan-assembler "avx_movups256/1" } } */
-/* { dg-final { scan-assembler-not "avx_movups/1" } } */
+/* { dg-final { scan-assembler "avx_loadups256" } } */
+/* { dg-final { scan-assembler-not "sse_loadups" } } */
 /* { dg-final { scan-assembler-not "vinsertf128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-store-1.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-store-1.c        (revision 
192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-store-1.c        (working copy)
@@ -17,6 +17,6 @@ avx_test (void)
     d[i] = c[i] * 20.0;
 }
 
-/* { dg-final { scan-assembler-not "avx_movups256/2" } } */
+/* { dg-final { scan-assembler-not "avx_storeups256" } } */
 /* { dg-final { scan-assembler "vmovups.*\\*movv4sf_internal/3" } } */
 /* { dg-final { scan-assembler "vextractf128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-store-2.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-store-2.c        (revision 
192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-store-2.c        (working copy)
@@ -24,6 +24,6 @@ avx_test (void)
     }
 }
 
-/* { dg-final { scan-assembler-not "avx_movdqu256/2" } } */
+/* { dg-final { scan-assembler-not "avx_storedqu256" } } */
 /* { dg-final { scan-assembler "vmovdqu.*\\*movv16qi_internal/3" } } */
 /* { dg-final { scan-assembler "vextract.128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-store-3.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-store-3.c        (revision 
192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-store-3.c        (working copy)
@@ -17,6 +17,6 @@ avx_test (void)
     d[i] = c[i] * 20.0;
 }
 
-/* { dg-final { scan-assembler-not "avx_movupd256/2" } } */
+/* { dg-final { scan-assembler-not "avx_storeupd256" } } */
 /* { dg-final { scan-assembler "vmovupd.*\\*movv2df_internal/3" } } */
 /* { dg-final { scan-assembler "vextractf128" } } */
Index: testsuite/gcc.target/i386/avx256-unaligned-store-4.c
===================================================================
--- testsuite/gcc.target/i386/avx256-unaligned-store-4.c        (revision 
192420)
+++ testsuite/gcc.target/i386/avx256-unaligned-store-4.c        (working copy)
@@ -14,7 +14,7 @@ avx_test (void)
     b[i+3] = a[i] * c[i];
 }
 
-/* { dg-final { scan-assembler "avx_movups256/2" } } */
-/* { dg-final { scan-assembler-not "avx_movups/2" } } */
+/* { dg-final { scan-assembler "avx_storeups256" } } */
+/* { dg-final { scan-assembler-not "sse_storeups" } } */
 /* { dg-final { scan-assembler-not "\\*avx_movv4sf_internal/3" } } */
 /* { dg-final { scan-assembler-not "vextractf128" } } */

Reply via email to