Hello,

This patch extends the unaligned load and store patterns.  I've
refactored the original patch (stored on an SVN branch) toward reducing
the complexity of the conditions in define_insn
"<avx512>_storedqu<mode>_mask".
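For context, here is the kind of source-level code the masked-store
patterns are meant to back.  This is an illustration only, not part of
the patch (intrinsics as declared in immintrin.h, assuming -O2 -mavx512f):

/* Illustration only (not part of the patch): masked unaligned stores
   that the VI48 "<avx512>_storedqu<mode>_mask" pattern should back.  */
#include <immintrin.h>

void
store_dwords (int *p, __mmask16 k, __m512i v)
{
  /* Expected to emit vmovdqu32 with a merge-mask.  */
  _mm512_mask_storeu_epi32 (p, k, v);
}

void
store_qwords (long long *p, __mmask8 k, __m512i v)
{
  /* Expected to emit vmovdqu64 with a merge-mask.  */
  _mm512_mask_storeu_epi64 (p, k, v);
}

Each function should end up as a single masked unaligned store, which is
exactly what the single "vmovdqu<ssescalarsize>" template below emits.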
It seems like such a trick won't work for:
    <sse2_avx_avx512f>_loaddqu<mode><mask_name>
The problem is the V[32|16]QI modes, which are enabled for SSE/AVX
without masking and for AVX-512BW & AVX-512VL when masking is on.
Of course, I can split the define_insn & define_expand into 3 patterns
with mode iterators of:
   1. V16QI, V32QI - baseline is SSE2, masks enabled for AVX-512BW&VL
   2. V64QI, V8HI, V16HI, V32HI - baseline is AVX-512BW, masks enabled
      for AVX-512VL
   3. V8DI, V4DI, V2DI, V16SI, V8SI, V4SI - baseline is AVX-512F, masks
      enabled for AVX-512VL.
But such an approach would lead to 6 patterns instead of 2 (with
non-trivial asm emit), so I have doubts whether it is useful.  (A small
intrinsics sketch of the V32QI masking corner follows the patch below.)

The current patch passes bootstrap and shows no regressions under the
simulator.

What do you think?

gcc/
	* config/i386/sse.md (define_mode_iterator VI48_AVX512VL): New.
	(define_mode_iterator VI_UNALIGNED_LOADSTORE): Add V64QI, V32HI,
	V16HI, V8HI, V4SI, V4DI, V2DI modes.
	(define_expand "<sse2_avx_avx512f>_loaddqu<mode><mask_name>"):
	Update condition.
	(define_insn "*<sse2_avx_avx512f>_loaddqu<mode><mask_name>"):
	Update condition, handle new modes.
	(define_insn "<sse2_avx_avx512f>_storedqu<mode>"): Handle new
	modes.
	(define_insn "avx512f_storedqu<mode>_mask"): Delete.
	(define_insn "<avx512>_storedqu<mode>_mask" with VI48_AVX512VL):
	New.
	(define_insn "<avx512>_storedqu<mode>_mask" with VI12_AVX512VL):
	Ditto.

--
Thanks, K

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index cd0c08e..51cfada 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -235,6 +235,10 @@
 (define_mode_iterator VF_512
   [V16SF V8DF])
 
+(define_mode_iterator VI48_AVX512VL
+  [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
+
 (define_mode_iterator VF2_AVX512VL
   [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
 
@@ -260,8 +264,12 @@
   [(V32QI "TARGET_AVX") V16QI])
 
 (define_mode_iterator VI_UNALIGNED_LOADSTORE
-  [(V32QI "TARGET_AVX") V16QI
-   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512BW")
+   (V16HI "TARGET_AVX512BW && TARGET_AVX512VL")
+   (V8HI "TARGET_AVX512BW && TARGET_AVX512VL")
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
 ;; All DImode vector integer modes
 (define_mode_iterator VI8
@@ -1172,7 +1180,10 @@
 	(unspec:VI_UNALIGNED_LOADSTORE
 	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand")]
 	  UNSPEC_LOADU))]
-  "TARGET_SSE2 && <mask_mode512bit_condition>"
+  "TARGET_SSE2
+   && (!<mask_applied>
+       || (TARGET_AVX512BW && TARGET_AVX512VL)
+       || (<MODE>mode != V32QImode && (<MODE>mode != V16QImode)))"
 {
   /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads
      just fine if misaligned_operand is true, and without the UNSPEC it can
@@ -1197,20 +1208,27 @@
 	(unspec:VI_UNALIGNED_LOADSTORE
 	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")]
 	  UNSPEC_LOADU))]
-  "TARGET_SSE2 && <mask_mode512bit_condition>"
+  "TARGET_SSE2
+   && (!<mask_applied>
+       || (TARGET_AVX512BW && TARGET_AVX512VL)
+       || (<MODE>mode != V32QImode && (<MODE>mode != V16QImode)))"
 {
   switch (get_attr_mode (insn))
     {
+    case MODE_V16SF:
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
-    case MODE_XI:
-      if (<MODE>mode == V8DImode)
-	return "vmovdqu64\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
-      else
-	return "vmovdqu32\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
     default:
-      return "%vmovdqu\t{%1, %0|%0, %1}";
+      switch (<MODE>mode)
+	{
+	case V32QImode:
+	case V16QImode:
+	  if (!(TARGET_AVX512VL && TARGET_AVX512BW))
+	    return "%vmovdqu\t{%1, %0|%0, %1}";
+	default:
+	  return "vmovdqu<ssescalarsize>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
+	}
     }
 }
   [(set_attr "type" "ssemov")
@@ -1246,13 +1264,16 @@
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
-    case MODE_XI:
-      if (<MODE>mode == V8DImode)
-	return "vmovdqu64\t{%1, %0|%0, %1}";
-      else
-	return "vmovdqu32\t{%1, %0|%0, %1}";
     default:
-      return "%vmovdqu\t{%1, %0|%0, %1}";
+      switch (<MODE>mode)
+	{
+	case V32QImode:
+	case V16QImode:
+	  if (!(TARGET_AVX512VL && TARGET_AVX512BW))
+	    return "%vmovdqu\t{%1, %0|%0, %1}";
+	default:
+	  return "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}";
+	}
     }
 }
   [(set_attr "type" "ssemov")
@@ -1276,21 +1297,32 @@
 	 ]
 	 (const_string "<sseinsnmode>")))])
 
-(define_insn "avx512f_storedqu<mode>_mask"
-  [(set (match_operand:VI48_512 0 "memory_operand" "=m")
-	(vec_merge:VI48_512
-	  (unspec:VI48_512
-	    [(match_operand:VI48_512 1 "register_operand" "v")]
+(define_insn "<avx512>_storedqu<mode>_mask"
+  [(set (match_operand:VI48_AVX512VL 0 "memory_operand" "=m")
+	(vec_merge:VI48_AVX512VL
+	  (unspec:VI48_AVX512VL
+	    [(match_operand:VI48_AVX512VL 1 "register_operand" "v")]
 	    UNSPEC_STOREU)
 	  (match_dup 0)
 	  (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
   "TARGET_AVX512F"
-{
-  if (<MODE>mode == V8DImode)
-    return "vmovdqu64\t{%1, %0%{%2%}|%0%{%2%}, %1}";
-  else
-    return "vmovdqu32\t{%1, %0%{%2%}|%0%{%2%}, %1}";
-}
+  "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx512>_storedqu<mode>_mask"
+  [(set (match_operand:VI12_AVX512VL 0 "memory_operand" "=m")
+	(vec_merge:VI12_AVX512VL
+	  (unspec:VI12_AVX512VL
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")]
+	    UNSPEC_STOREU)
+	  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
+  "TARGET_AVX512BW"
+  "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set_attr "memory" "store")