Re: [PATCH 5/6] mips: Implement vec_perm_const.

Richard Henderson Sun, 11 Dec 2011 11:28:26 -0800

On 12/11/2011 04:50 AM, Richard Sandiford wrote:
> [Mingjie, please could you help with the Loongson question near the end?]


Actually, can you tell me how to test these abi combinations?  I keep trying to 
use mips-sim or mips64-sim and get linker errors complaining of abi 
combinations.

>      Little-endian:
> 
>         The semantics of the RTL pattern are:
> 
>       { 0L, 0U } = { X[I3], X[I4 + 2] }, where X = { 1L, 1U, 2L, 2U }
> 
>       so: 0L = { 1L, 1U }[I3] (= <bop><bUL>)
>           0U = { 2L, 2U }[I4] (= <aop><aUL>)
> 
>           <aop> = 2, <aUL> = I4 ? U : L
>           <bop> = 1, <bUL> = I3 ? U : L
> 
>           [LL] !I4 && !I3   [UL] I4 && !I3
>           [LU] !I4 && I3    [UU] I4 && I3
> 
>      Big-endian:
> 
>         The semantics of the RTL pattern are:
> 
>       { 0U, 0L } = { X[I3], X[I4 + 2] }, where X = { 1U, 1L, 2U, 2L }
> 
>       so: 0U = { 1U, 1L }[I3] (= <aop><aUL>)
>           0L = { 2U, 2L }[I4] (= <bop><bUL>)
> 
>           <aop> = 1, <aUL> = I3 ? L : U
>           <bop> = 2, <bUL> = I4 ? L : U
> 
>           [UU] !I3 && !I4   [UL] !I3 && I4
>           [LU] I3 && !I4    [LL] I3 && I4.  */
> 
> which suggests that the PUL and PLU entries for big-endian should be
> the other way around.  Does that sound right, or have I misunderstood?

Yes, that sounds right.

> ...for little-endian, we need to pass the "U" and "L" components of the
> mnemonic in the reverse order: the MIPS instruction specifies the upper
> part first, whereas the rtl pattern specifies the lower part first.
> And for little-endian, U refers to memory element 1 and L to memory
> element 0.  So I think this should be:

... Except that the actual output of the LE insn actually swaps the operands 
too.  So I think these expanders should not *also* swap the operands.  I've 
tidied these up a bit since then.

>> +static bool
>> +mips_expand_vpc_ps (struct expand_vec_perm_d *d)

I've eliminated this function since then.

>> +  /* Convert the selector into the packed 8-bit form for pshufh.  */
>> +  for (i = mask = 0; i < 4; i++)
>> +    mask |= (d->perm[i] & 3) << (i * 2);
> 
> I think this is endian-dependent.  For little-endian, the bottom two bits
> of the mask determine element 0; for big-endian, the top two bits of the
> mask do. 

Recall that loongson can only run in little-endian.  I added comments about 
that in the md file, but it would do no harm to add another here.

> (There's a machine in the farm, but bootstrapping on it is rather slow.)

Yeah, I started checking out the tree there yesterday and it never completed.

> I think a lot of the endianness stuff in the patch is dependent on byte
> endianness rather than word endianness.  Since we only support two out
> of the four combinations, it seems better not to worry which and simply
> use TARGET_{BIG,LITTLE}_ENDIAN instead of {WORDS,BYTES}_{BIG,LITTLE}_ENDIAN.

Sure.

This is my current patch, which doesn't have the pul/plu insns swapped, as 
suggested above.  I did change the loongson.h interface as H-P suggested.


r~

commit b7790c7a9e53d66d1f348c3f2adb5b8a9bf2d93c
Author: Richard Henderson <r...@redhat.com>
Date:   Wed Dec 7 14:17:02 2011 -0800

    mips: Implement vec_perm_const.

diff --git a/gcc/config/mips/loongson.h b/gcc/config/mips/loongson.h
index 6bfd4d7..dfd6505 100644
--- a/gcc/config/mips/loongson.h
+++ b/gcc/config/mips/loongson.h
@@ -447,15 +447,15 @@ psadbh (uint8x8_t s, uint8x8_t t)
 
 /* Shuffle halfwords.  */
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pshufh_u (uint16x4_t dest, uint16x4_t s, uint8_t order)
+pshufh_u (uint16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_u (dest, s, order);
+  return __builtin_loongson_pshufh_u (s, order);
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pshufh_s (int16x4_t dest, int16x4_t s, uint8_t order)
+pshufh_s (int16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_s (dest, s, order);
+  return __builtin_loongson_pshufh_s (s, order);
 }
 
 /* Shift left logical.  */
diff --git a/gcc/config/mips/loongson.md b/gcc/config/mips/loongson.md
index 225f4d1..7c7e29f 100644
--- a/gcc/config/mips/loongson.md
+++ b/gcc/config/mips/loongson.md
@@ -24,10 +24,7 @@
   UNSPEC_LOONGSON_PCMPEQ
   UNSPEC_LOONGSON_PCMPGT
   UNSPEC_LOONGSON_PEXTR
-  UNSPEC_LOONGSON_PINSR_0
-  UNSPEC_LOONGSON_PINSR_1
-  UNSPEC_LOONGSON_PINSR_2
-  UNSPEC_LOONGSON_PINSR_3
+  UNSPEC_LOONGSON_PINSRH
   UNSPEC_LOONGSON_PMADD
   UNSPEC_LOONGSON_PMOVMSK
   UNSPEC_LOONGSON_PMULHU
@@ -200,6 +197,51 @@
   "pandn\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
+;; Logical AND.
+(define_insn "*loongson_and"
+  [(set (match_operand:VWHB 0 "register_operand" "=f")
+       (and:VWHB (match_operand:VWHB 1 "register_operand" "f")
+                 (match_operand:VWHB 2 "register_operand" "f")))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "and\t%0,%1,%2"
+  [(set_attr "type" "fmul")])
+
+;; Logical OR.
+(define_insn "*loongson_or"
+  [(set (match_operand:VWHB 0 "register_operand" "=f")
+       (ior:VWHB (match_operand:VWHB 1 "register_operand" "f")
+                 (match_operand:VWHB 2 "register_operand" "f")))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "or\t%0,%1,%2"
+  [(set_attr "type" "fmul")])
+
+;; Logical XOR.
+(define_insn "*loongson_xor"
+  [(set (match_operand:VWHB 0 "register_operand" "=f")
+       (xor:VWHB (match_operand:VWHB 1 "register_operand" "f")
+                 (match_operand:VWHB 2 "register_operand" "f")))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "xor\t%0,%1,%2"
+  [(set_attr "type" "fmul")])
+
+;; Logical NOR.
+(define_insn "*loongson_nor"
+  [(set (match_operand:VWHB 0 "register_operand" "=f")
+       (and:VWHB
+         (not:VWHB (match_operand:VWHB 1 "register_operand" "f"))
+         (not:VWHB (match_operand:VWHB 2 "register_operand" "f"))))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "nor\t%0,%1,%2"
+  [(set_attr "type" "fmul")])
+
+;; Logical NOT.
+(define_insn "*loongson_not"
+  [(set (match_operand:VWHB 0 "register_operand" "=f")
+       (not:VWHB (match_operand:VWHB 1 "register_operand" "f")))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "nor\t%0,%1,%1"
+  [(set_attr "type" "fmul")])
+
 ;; Average.
 (define_insn "loongson_pavg<V_suffix>"
   [(set (match_operand:VHB 0 "register_operand" "=f")
@@ -231,52 +273,110 @@
   [(set_attr "type" "fadd")])
 
 ;; Extract halfword.
-(define_insn "loongson_pextr<V_suffix>"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-                   (match_operand:SI 2 "register_operand" "f")]
+(define_insn "loongson_pextrh"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+        (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+                     (match_operand:SI 2 "register_operand" "f")]
                   UNSPEC_LOONGSON_PEXTR))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
   "pextr<V_suffix>\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
+(define_insn "vec_extractv4hi_1"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+       (zero_extend:SI
+         (unspec:HI
+           [(match_operand:V4HI 1 "register_operand" "f")
+            (match_operand:SI 2 "register_operand" "f")]
+           UNSPEC_LOONGSON_PEXTR)))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "pextr<V_suffix>\t%0,%1,%2"
+  [(set_attr "type" "fmul")])
+
+(define_expand "vec_extractv4hi"
+  [(match_operand:HI 0 "register_operand" "")
+   (match_operand:V4HI 1 "register_operand" "")
+   (match_operand:SI 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  rtx x = gen_reg_rtx (SImode);
+  emit_insn (gen_vec_extractv4hi_1 (x, operands[1], operands[2]));
+  emit_move_insn (operands[0], gen_lowpart (HImode, x));
+  DONE;
+})
+
 ;; Insert halfword.
-(define_insn "loongson_pinsr<V_suffix>_0"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-                   (match_operand:VH 2 "register_operand" "f")]
-                  UNSPEC_LOONGSON_PINSR_0))]
+(define_insn "loongson_pinsrh_0"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 4) (const_int 1)
+                    (const_int 2) (const_int 3)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "pinsrh_0\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_pinsrh_1"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 4)
+                    (const_int 2) (const_int 3)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_0\t%0,%1,%2"
+  "pinsrh_1\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_1"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-                   (match_operand:VH 2 "register_operand" "f")]
-                  UNSPEC_LOONGSON_PINSR_1))]
+(define_insn "loongson_pinsrh_2"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 1)
+                    (const_int 4) (const_int 3)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_1\t%0,%1,%2"
+  "pinsrh_2\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_2"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-                   (match_operand:VH 2 "register_operand" "f")]
-                  UNSPEC_LOONGSON_PINSR_2))]
+(define_insn "loongson_pinsrh_3"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 1)
+                    (const_int 2) (const_int 4)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_2\t%0,%1,%2"
+  "pinsrh_3\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_3"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-                   (match_operand:VH 2 "register_operand" "f")]
-                  UNSPEC_LOONGSON_PINSR_3))]
+(define_insn "*vec_setv4hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+                     (match_operand:SI 2 "register_operand" "f")
+                     (match_operand:SI 3 "const_0_to_3_operand" "")]
+                    UNSPEC_LOONGSON_PINSRH))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_3\t%0,%1,%2"
+  "pinsrh_%3\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_expand "vec_setv4hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+                     (match_operand:HI 2 "register_operand" "f")
+                     (match_operand:SI 3 "const_0_to_3_operand" "")]
+                    UNSPEC_LOONGSON_PINSRH))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  rtx ext = gen_reg_rtx (SImode);
+  emit_move_insn (ext, gen_lowpart (SImode, operands[1]));
+  operands[1] = ext;
+})
+
 ;; Multiply and add packed integers.
 (define_insn "loongson_pmadd<V_stretch_half_suffix>"
   [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
@@ -403,12 +503,11 @@
 ;; Shuffle halfwords.
 (define_insn "loongson_pshufh"
   [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "0")
-                   (match_operand:VH 2 "register_operand" "f")
-                   (match_operand:SI 3 "register_operand" "f")]
+        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+                   (match_operand:SI 2 "register_operand" "f")]
                   UNSPEC_LOONGSON_PSHUFH))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pshufh\t%0,%2,%3"
+  "pshufh\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
 ;; Shift left logical.
@@ -478,26 +577,95 @@
   "psubus<V_suffix>\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
-;; Unpack high data.
-(define_insn "vec_interleave_high<mode>"
-  [(set (match_operand:VWHB 0 "register_operand" "=f")
-        (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
-                     (match_operand:VWHB 2 "register_operand" "f")]
-                    UNSPEC_LOONGSON_PUNPCKH))]
+;; Unpack high data.  Recall that Loongson only runs in little-endian.
+(define_insn "loongson_punpckhbh"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+       (vec_select:V8QI
+         (vec_concat:V16QI
+           (match_operand:V8QI 1 "register_operand" "f")
+           (match_operand:V8QI 2 "register_operand" "f"))
+         (parallel [(const_int 4) (const_int 12)
+                    (const_int 5) (const_int 13)
+                    (const_int 6) (const_int 14)
+                    (const_int 7) (const_int 15)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhbh\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhhw"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 2) (const_int 6)
+                    (const_int 3) (const_int 7)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "punpckh<V_stretch_half_suffix>\t%0,%1,%2"
+  "punpckhhw\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhwd"
+  [(set (match_operand:V2SI 0 "register_operand" "=f")
+       (vec_select:V2SI
+         (vec_concat:V4SI
+           (match_operand:V2SI 1 "register_operand" "f")
+           (match_operand:V2SI 2 "register_operand" "f"))
+         (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhwd\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
 ;; Unpack low data.
-(define_insn "vec_interleave_low<mode>"
-  [(set (match_operand:VWHB 0 "register_operand" "=f")
-        (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
-                     (match_operand:VWHB 2 "register_operand" "f")]
-                    UNSPEC_LOONGSON_PUNPCKL))]
+(define_insn "loongson_punpcklbh"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+       (vec_select:V8QI
+         (vec_concat:V16QI
+           (match_operand:V8QI 1 "register_operand" "f")
+           (match_operand:V8QI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 8)
+                    (const_int 1) (const_int 9)
+                    (const_int 2) (const_int 10)
+                    (const_int 3) (const_int 11)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklbh\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpcklhw"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+       (vec_select:V4HI
+         (vec_concat:V8HI
+           (match_operand:V4HI 1 "register_operand" "f")
+           (match_operand:V4HI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 4)
+                    (const_int 1) (const_int 5)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "punpckl<V_stretch_half_suffix>\t%0,%1,%2"
+  "punpcklhw\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_insn "loongson_punpcklwd"
+  [(set (match_operand:V2SI 0 "register_operand" "=f")
+       (vec_select:V2SI
+         (vec_concat:V4SI
+           (match_operand:V2SI 1 "register_operand" "f")
+           (match_operand:V2SI 2 "register_operand" "f"))
+         (parallel [(const_int 0) (const_int 2)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklwd\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")
+   (match_operand:VWHB 2 "register_operand" "")
+   (match_operand:VWHB 3 "" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
 ;; Integer division and modulus.  For integer multiplication, see mips.md.
 
 (define_insn "<u>div<mode>3"
diff --git a/gcc/config/mips/mips-modes.def b/gcc/config/mips/mips-modes.def
index b9c508b..85861a9 100644
--- a/gcc/config/mips/mips-modes.def
+++ b/gcc/config/mips/mips-modes.def
@@ -26,9 +26,15 @@ RESET_FLOAT_FORMAT (DF, mips_double_format);
 FLOAT_MODE (TF, 16, mips_quad_format);
 
 /* Vector modes.  */
-VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
-VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
-VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
+VECTOR_MODES (INT, 8);        /*       V8QI  V4HI V2SI */
+VECTOR_MODES (FLOAT, 8);      /*             V4HF V2SF */
+VECTOR_MODES (INT, 4);        /*             V4QI V2HI */
+
+/* Double-sized vector modes for vec_concat.  */
+VECTOR_MODE (INT, QI, 16);
+VECTOR_MODE (INT, HI, 8);
+VECTOR_MODE (INT, SI, 4);
+VECTOR_MODE (FLOAT, SF, 4);
 
 VECTOR_MODES (FRACT, 4);       /* V4QQ  V2HQ */
 VECTOR_MODES (UFRACT, 4);      /* V4UQQ V2UHQ */
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index dbabdff..37c958d 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -328,6 +328,7 @@ extern void mips_expand_atomic_qihi (union mips_gen_fn_ptrs,
                                     rtx, rtx, rtx, rtx);
 
 extern void mips_expand_vector_init (rtx, rtx);
+extern bool mips_expand_vec_perm_const (rtx op[4]);
 
 extern bool mips_eh_uses (unsigned int);
 extern bool mips_epilogue_uses (unsigned int);
diff --git a/gcc/config/mips/mips-ps-3d.md b/gcc/config/mips/mips-ps-3d.md
index 504f43c..1b07f12 100644
--- a/gcc/config/mips/mips-ps-3d.md
+++ b/gcc/config/mips/mips-ps-3d.md
@@ -89,61 +89,94 @@
   DONE;
 })
 
-; pul.ps - Pair Upper Lower
-(define_insn "mips_pul_ps"
+(define_insn "*vec_perm_const_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-       (vec_merge:V2SF
-        (match_operand:V2SF 1 "register_operand" "f")
-        (match_operand:V2SF 2 "register_operand" "f")
-        (const_int 2)))]
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:V2SF 1 "register_operand" "f")
+           (match_operand:V2SF 2 "register_operand" "f"))
+         (parallel [(match_operand:SI 3 "const_0_or_1_operand" "")
+                    (match_operand:SI 4 "const_2_or_3_operand" "")])))]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pul.ps\t%0,%1,%2"
+{
+  static const char * const mnemonics[2][4] = {
+    /* LE */ { "pll.ps\t%0,%2,%1", "pul.ps\t%0,%2,%1",
+              "plu.ps\t%0,%2,%1", "puu.ps\t%0,%2,%1" },
+    /* BE */ { "puu.ps\t%0,%1,%2", "plu.ps\t%0,%1,%2",
+              "pul.ps\t%0,%1,%2", "pll.ps\t%0,%1,%2" },
+  };
+
+  unsigned mask = INTVAL (operands[3]) * 2 + (INTVAL (operands[4]) - 2);
+  return mnemonics[BYTES_BIG_ENDIAN][mask];
+}
   [(set_attr "type" "fmove")
    (set_attr "mode" "SF")])
 
-; puu.ps - Pair upper upper
-(define_insn "mips_puu_ps"
+(define_expand "vec_perm_constv2sf"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")
+   (match_operand:V2SI 3 "" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
+;; Expanders for builtins
+(define_expand "mips_puu_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-       (vec_merge:V2SF
-        (match_operand:V2SF 1 "register_operand" "f")
-        (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-                         (parallel [(const_int 1)
-                                    (const_int 0)]))
-        (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "puu.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:V2SF 1 "register_operand" "f")
+           (match_operand:V2SF 2 "register_operand" "f"))
+         (parallel [(match_dup 3) (match_dup 4)])))]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  operands[3] = (BYTES_BIG_ENDIAN ? const0_rtx : const1_rtx);
+  operands[4] = (BYTES_BIG_ENDIAN ? const2_rtx : GEN_INT (3));
+})
 
-; pll.ps - Pair Lower Lower
-(define_insn "mips_pll_ps"
+(define_expand "mips_pul_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-       (vec_merge:V2SF
-        (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-                         (parallel [(const_int 1)
-                                    (const_int 0)]))
-        (match_operand:V2SF 2 "register_operand" "f")
-        (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pll.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:V2SF 1 "register_operand" "f")
+           (match_operand:V2SF 2 "register_operand" "f"))
+         (parallel [(match_dup 3) (match_dup 4)])))]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  operands[3] = (BYTES_BIG_ENDIAN ? const1_rtx : const0_rtx);
+  operands[4] = (BYTES_BIG_ENDIAN ? const2_rtx : GEN_INT (3));
+})
 
-; plu.ps - Pair Lower Upper
-(define_insn "mips_plu_ps"
+(define_expand "mips_plu_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-       (vec_merge:V2SF
-        (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-                         (parallel [(const_int 1)
-                                    (const_int 0)]))
-        (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-                         (parallel [(const_int 1)
-                                    (const_int 0)]))
-        (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "plu.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:V2SF 1 "register_operand" "f")
+           (match_operand:V2SF 2 "register_operand" "f"))
+         (parallel [(match_dup 3) (match_dup 4)])))]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  operands[3] = (BYTES_BIG_ENDIAN ? const0_rtx : const1_rtx);
+  operands[4] = (BYTES_BIG_ENDIAN ? GEN_INT (3) : const2_rtx);
+})
+
+(define_expand "mips_pll_ps"
+  [(set (match_operand:V2SF 0 "register_operand" "=f")
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:V2SF 1 "register_operand" "f")
+           (match_operand:V2SF 2 "register_operand" "f"))
+         (parallel [(match_dup 3) (match_dup 4)])))]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  operands[3] = (BYTES_BIG_ENDIAN ? const1_rtx : const0_rtx);
+  operands[4] = (BYTES_BIG_ENDIAN ? GEN_INT (3) : const2_rtx);
+})
 
 ; vec_init
 (define_expand "vec_initv2sf"
@@ -195,22 +228,21 @@
 ;; no other way to get a vector mode bitfield store currently.
 
 (define_expand "vec_setv2sf"
-  [(match_operand:V2SF 0 "register_operand")
-   (match_operand:SF 1 "register_operand")
-   (match_operand 2 "const_0_or_1_operand")]
+  [(set (match_operand:V2SF 0 "register_operand" "")
+       (vec_select:V2SF
+         (vec_concat:V4SF
+           (match_operand:SF 1 "register_operand" "")
+           (match_dup 0))
+         (parallel [(match_operand 2 "const_0_or_1_operand" "")
+                    (match_dup 3)])))]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
 {
-  rtx temp;
-
   /* We don't have an insert instruction, so we duplicate the float, and
      then use a PUL instruction.  */
-  temp = gen_reg_rtx (V2SFmode);
+  rtx temp = gen_reg_rtx (V2SFmode);
   emit_insn (gen_mips_cvt_ps_s (temp, operands[1], operands[1]));
-  if (INTVAL (operands[2]) == !BYTES_BIG_ENDIAN)
-    emit_insn (gen_mips_pul_ps (operands[0], temp, operands[0]));
-  else
-    emit_insn (gen_mips_pul_ps (operands[0], operands[0], temp));
-  DONE;
+  operands[1] = temp;
+  operands[3] = GEN_INT (1 - INTVAL (operands[2]) + 2);
 })
 
 ; cvt.ps.s - Floating Point Convert Pair to Paired Single
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index d3fd709..2f2578a 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -12774,12 +12774,6 @@ AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
 #define CODE_FOR_loongson_psubsb CODE_FOR_sssubv8qi3
 #define CODE_FOR_loongson_psubush CODE_FOR_ussubv4hi3
 #define CODE_FOR_loongson_psubusb CODE_FOR_ussubv8qi3
-#define CODE_FOR_loongson_punpckhbh CODE_FOR_vec_interleave_highv8qi
-#define CODE_FOR_loongson_punpckhhw CODE_FOR_vec_interleave_highv4hi
-#define CODE_FOR_loongson_punpckhwd CODE_FOR_vec_interleave_highv2si
-#define CODE_FOR_loongson_punpcklbh CODE_FOR_vec_interleave_lowv8qi
-#define CODE_FOR_loongson_punpcklhw CODE_FOR_vec_interleave_lowv4hi
-#define CODE_FOR_loongson_punpcklwd CODE_FOR_vec_interleave_lowv2si
 
 static const struct mips_builtin_description mips_builtins[] = {
   DIRECT_BUILTIN (pll_ps, MIPS_V2SF_FTYPE_V2SF_V2SF, paired_single),
@@ -13021,8 +13015,8 @@ static const struct mips_builtin_description 
mips_builtins[] = {
   LOONGSON_BUILTIN (pasubub, MIPS_UV8QI_FTYPE_UV8QI_UV8QI),
   LOONGSON_BUILTIN (biadd, MIPS_UV4HI_FTYPE_UV8QI),
   LOONGSON_BUILTIN (psadbh, MIPS_UV4HI_FTYPE_UV8QI_UV8QI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UV4HI_UQI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_V4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllw, u, MIPS_UV2SI_FTYPE_UV2SI_UQI),
@@ -16326,6 +16320,219 @@ mips_shift_truncation_mask (enum machine_mode mode)
 }
 
 
+/* Generate or test for an insn that supports a constant permutation.  */
+
+#define MAX_VECT_LEN 8
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+   return true if that's a valid instruction in the active ISA.  */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
+{
+  rtx rperm[MAX_VECT_LEN], x;
+  unsigned i;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (perm[i]);
+
+  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
+  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
+  x = gen_rtx_SET (VOIDmode, target, x);
+
+  x = emit_insn (x);
+  if (recog_memoized (x) < 0)
+    {
+      remove_insn (x);
+      return false;
+    }
+  return true;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well.  */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+                       const unsigned char *perm, unsigned nelt)
+{
+  enum machine_mode v2mode;
+  rtx x;
+
+  v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
+  x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
+  return expand_vselect (target, x, perm, nelt);
+}
+
+/* Recognize patterns for the Loongson PSHUFH instruction.  */
+
+static bool
+mips_expand_vpc_loongson_pshufh (struct expand_vec_perm_d *d)
+{
+  unsigned i, mask;
+
+  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+    return false;
+  if (d->vmode != V4HImode)
+    return false;
+  if (!d->one_vector_p)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  /* Convert the selector into the packed 8-bit form for pshufh.  */
+  for (i = mask = 0; i < 4; i++)
+    mask |= (d->perm[i] & 3) << (i * 2);
+
+  emit_insn (gen_loongson_pshufh (d->target, d->op0,
+                                 force_reg (SImode, GEN_INT (mask))));
+  return true;
+}
+
+static bool
+mips_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  unsigned char perm2[MAX_VECT_LEN];
+
+  if (d->one_vector_p)
+    {
+      /* Try interleave with alternating operands.  */
+      memcpy (perm2, d->perm, sizeof(perm2));
+      for (i = 1; i < nelt; i += 2)
+       perm2[i] += nelt;
+      if (expand_vselect_vconcat (d->target, d->op0, d->op1, perm2, nelt))
+       return true;
+    }
+  else
+    {
+      if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+       return true;
+
+      /* Try again with swapped operands.  */
+      for (i = 0; i < nelt; ++i)
+       perm2[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+       return true;
+    }
+
+  if (mips_expand_vpc_loongson_pshufh (d))
+    return true;
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+mips_expand_vec_perm_const (rtx operands[4])
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+  rtx sel;
+
+  d.target = operands[0];
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
+
+  d.vmode = GET_MODE (d.target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (d.op0, d.op1))
+       break;
+
+      /* The backend (vec_select (vec_concat)) patterns are not duplicated
+        for single-operand.  Try once with the original un-folded selector. */
+      if (mips_expand_vec_perm_const_1 (&d))
+       return true;
+
+      /* Try again after folding the selector to a single operand.  */
+      /* FALLTHRU */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] &= nelt - 1;
+      d.op0 = d.op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = d.op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return mips_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
+
+static bool
+mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                 const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Categorize the set of elements in the selector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = mips_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -16544,6 +16751,9 @@ mips_shift_truncation_mask (enum machine_mode mode)
 #undef TARGET_SHIFT_TRUNCATION_MASK
 #define TARGET_SHIFT_TRUNCATION_MASK mips_shift_truncation_mask
 
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK mips_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-mips.h"
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index ee40cfa..fe1e135 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -517,7 +517,7 @@ struct mips_cpu_info {
                                                                        \
       /* Whether Loongson vector modes are enabled.  */                 \
       if (TARGET_LOONGSON_VECTORS)                                     \
-        builtin_define ("__mips_loongson_vector_rev");                  \
+        builtin_define ("__mips_loongson_vector_rev=2");                \
                                                                        \
       /* Historical Octeon macro.  */                                  \
       if (TARGET_OCTEON)                                               \
diff --git a/gcc/config/mips/predicates.md b/gcc/config/mips/predicates.md
index 5e9398e..b611373 100644
--- a/gcc/config/mips/predicates.md
+++ b/gcc/config/mips/predicates.md
@@ -73,8 +73,15 @@
 ;; This is used for indexing into vectors, and hence only accepts const_int.
 (define_predicate "const_0_or_1_operand"
   (and (match_code "const_int")
-       (ior (match_test "op == CONST0_RTX (GET_MODE (op))")
-           (match_test "op == CONST1_RTX (GET_MODE (op))"))))
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
+(define_predicate "const_2_or_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 3)")))
+
+(define_predicate "const_0_to_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 3)")))
 
 (define_predicate "qi_mask_operand"
   (and (match_code "const_int")

Re: [PATCH 5/6] mips: Implement vec_perm_const.

Reply via email to