This patch adds two define_insn patterns for NEON vabd instruction to
make combine pass recognize expressions matching (vabs (vsub ...))
patterns as vabd.
This patch reduces code size of x264 binary from 649143 to 648343 (800
bytes, or 0.12%) and increases its performance on average by 2.5% on
plain C version of x264 with -O2 -ftree-vectorize.
On SPEC2K it didn't make any difference -- all vabs instructions found
in SPEC2K binaries are either using .f64 mode or scalar .f32 which are
not supported by NEON's vabd.
Regtested with QEMU.
Ok for trunk?
--
Best regards,
Dmitry
2011-07-21 Sevak Sargsyan <sevak.sargs...@ispras.ru>
* config/arm/neon.md (neon_vabd<mode>_2, neon_vabd<mode>_3): New define_insn patterns for combine.
gcc/testsuite:
* gcc.target/arm/neon-combine-sub-abs-into-vabd.c: New test.
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index a8c1b87..f457365 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5607,3 +5607,32 @@
emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
DONE;
})
+
+(define_insn "neon_vabd<mode>_2"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON"
+ "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabd<mode>_3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w")]
+ UNSPEC_VSUB)))]
+ "TARGET_NEON"
+ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
diff --git a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
new file mode 100644
index 0000000..aae4117
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+float32x2_t f_sub_abs_to_vabd_32()
+{
+
+ float32x2_t val1 = vdup_n_f32 (10);
+ float32x2_t val2 = vdup_n_f32 (30);
+ float32x2_t sres = vsub_f32(val1, val2);
+ float32x2_t res = vabs_f32 (sres);
+
+ return res;
+}
+/* { dg-final { scan-assembler "vabd\.f32" } }*/
+
+#include <arm_neon.h>
+int8x8_t sub_abs_to_vabd_8()
+{
+
+ int8x8_t val1 = vdup_n_s8 (10);
+ int8x8_t val2 = vdup_n_s8 (30);
+ int8x8_t sres = vsub_s8(val1, val2);
+ int8x8_t res = vabs_s8 (sres);
+
+ return res;
+}
+/* { dg-final { scan-assembler "vabd\.s8" } }*/
+
+int16x4_t sub_abs_to_vabd_16()
+{
+
+ int16x4_t val1 = vdup_n_s16 (10);
+ int16x4_t val2 = vdup_n_s16 (30);
+ int16x4_t sres = vsub_s16(val1, val2);
+ int16x4_t res = vabs_s16 (sres);
+
+ return res;
+}
+/* { dg-final { scan-assembler "vabd\.s16" } }*/
+
+int32x2_t sub_abs_to_vabd_32()
+{
+
+ int32x2_t val1 = vdup_n_s32 (10);
+ int32x2_t val2 = vdup_n_s32 (30);
+ int32x2_t sres = vsub_s32(val1, val2);
+ int32x2_t res = vabs_s32 (sres);
+
+ return res;
+}
+/* { dg-final { scan-assembler "vabd\.s32" } }*/