Re: [PATCH] Transition nvptx backend to STORE_FLAG_VALUE = 1

2022-01-04 Thread Tom de Vries via Gcc-patches

On 10/5/21 19:48, Roger Sayle wrote:


This patch to the nvptx backend changes the backend's STORE_FLAG_VALUE
from -1 to 1, by using BImode predicates and selp instructions, instead
of set instructions (almost always followed by integer negation).

Historically, it was reasonable (through rare) for backends to use -1
for representing true during the RTL passes.  However with tree-ssa,
GCC now emits lots of code that reads and writes _Bool values, requiring
STORE_FLAG_VALUE=-1 targets to frequently convert 0/-1 pseudos to 0/1
pseudos using integer negation.  Unfortunately, this process prevents
or complicates many optimizations (negate isn't associative with logical
AND, OR and XOR, and interferes with range/vrp/nonzerobits bounds etc.).

The impact of this is that for a relatively simple logical expression
like "return (x==21) && (y==69);", the nvptx backend currently generates:

 mov.u32 %r26, %ar0;
 mov.u32 %r27, %ar1;
 set.u32.eq.u32  %r30, %r26, 21;
 neg.s32 %r31, %r30;
 mov.u32 %r29, %r31;
 set.u32.eq.u32  %r33, %r27, 69;
 neg.s32 %r34, %r33;
 mov.u32 %r32, %r34;
 cvt.u16.u8  %r39, %r29;
 mov.u16 %r36, %r39;
 cvt.u16.u8  %r39, %r32;
 mov.u16 %r37, %r39;
 and.b16 %r35, %r36, %r37;
 cvt.u32.u16 %r38, %r35;
 cvt.u32.u8  %value, %r38;

This patch tweaks nvptx to generate 0/1 values instead, requiring the
same number of instructions, using (BImode) predicate registers and selp
instructions so as to now generate the almost identical:

 mov.u32 %r26, %ar0;
 mov.u32 %r27, %ar1;
 setp.eq.u32 %r31, %r26, 21;
 selp.u32%r30, 1, 0, %r31;
 mov.u32 %r29, %r30;
 setp.eq.u32 %r34, %r27, 69;
 selp.u32%r33, 1, 0, %r34;
 mov.u32 %r32, %r33;
 cvt.u16.u8  %r39, %r29;
 mov.u16 %r36, %r39;
 cvt.u16.u8  %r39, %r32;
 mov.u16 %r37, %r39;
 and.b16 %r35, %r36, %r37;
 cvt.u32.u16 %r38, %r35;
 cvt.u32.u8  %value, %r38;

The hidden benefit is that this sequence can (in theory) be optimized
by the RTL passes to eventually generate a much shorter sequence using
an and.pred instruction (just like Nvidia's nvcc compiler).

This patch has been tested nvptx-none with a "make" and "make -k check"
(including newlib) hosted on x86_64-pc-linux-gnu with no new failures.
Ok for mainline?




Thanks for the patch, sounds reasonable.

Committed.

Thanks,
- Tom


2021-10-05  Roger Sayle  

gcc/ChangeLog
* config/nvptx/nvptx.h (STORE_FLAG_VALUE): Change to 1.
* config/nvptx/nvptx.md (movbi): Use P1 constraint for true.
(setcc_from_bi): Remove SImode specific pattern.
(setcc_from_bi): Provide more general HSDIM pattern.
(extendbi2, zeroextendbi2): Provide instructions
for sign- and zero-extending BImode predicates to integers.
(setcc_int): Remove previous (-1-based) instructions.
(cstorebi4): Remove BImode to SImode specific expander.
(cstore4): Fix indentation.  Expand using setccsi_from_bi.
(cstore4): For both integer and floating point modes.


Thanks in advance,
Roger
--



[PATCH] Transition nvptx backend to STORE_FLAG_VALUE = 1

2021-10-05 Thread Roger Sayle

This patch to the nvptx backend changes the backend's STORE_FLAG_VALUE
from -1 to 1, by using BImode predicates and selp instructions, instead
of set instructions (almost always followed by integer negation).

Historically, it was reasonable (through rare) for backends to use -1
for representing true during the RTL passes.  However with tree-ssa,
GCC now emits lots of code that reads and writes _Bool values, requiring
STORE_FLAG_VALUE=-1 targets to frequently convert 0/-1 pseudos to 0/1
pseudos using integer negation.  Unfortunately, this process prevents
or complicates many optimizations (negate isn't associative with logical
AND, OR and XOR, and interferes with range/vrp/nonzerobits bounds etc.).

The impact of this is that for a relatively simple logical expression
like "return (x==21) && (y==69);", the nvptx backend currently generates:

mov.u32 %r26, %ar0;
mov.u32 %r27, %ar1;
set.u32.eq.u32  %r30, %r26, 21;
neg.s32 %r31, %r30;
mov.u32 %r29, %r31;
set.u32.eq.u32  %r33, %r27, 69;
neg.s32 %r34, %r33;
mov.u32 %r32, %r34;
cvt.u16.u8  %r39, %r29;
mov.u16 %r36, %r39;
cvt.u16.u8  %r39, %r32;
mov.u16 %r37, %r39;
and.b16 %r35, %r36, %r37;
cvt.u32.u16 %r38, %r35;
cvt.u32.u8  %value, %r38;

This patch tweaks nvptx to generate 0/1 values instead, requiring the
same number of instructions, using (BImode) predicate registers and selp
instructions so as to now generate the almost identical:

mov.u32 %r26, %ar0;
mov.u32 %r27, %ar1;
setp.eq.u32 %r31, %r26, 21;
selp.u32%r30, 1, 0, %r31;
mov.u32 %r29, %r30;
setp.eq.u32 %r34, %r27, 69;
selp.u32%r33, 1, 0, %r34;
mov.u32 %r32, %r33;
cvt.u16.u8  %r39, %r29;
mov.u16 %r36, %r39;
cvt.u16.u8  %r39, %r32;
mov.u16 %r37, %r39;
and.b16 %r35, %r36, %r37;
cvt.u32.u16 %r38, %r35;
cvt.u32.u8  %value, %r38;

The hidden benefit is that this sequence can (in theory) be optimized
by the RTL passes to eventually generate a much shorter sequence using
an and.pred instruction (just like Nvidia's nvcc compiler).

This patch has been tested nvptx-none with a "make" and "make -k check"
(including newlib) hosted on x86_64-pc-linux-gnu with no new failures.
Ok for mainline?


2021-10-05  Roger Sayle  

gcc/ChangeLog
* config/nvptx/nvptx.h (STORE_FLAG_VALUE): Change to 1.
* config/nvptx/nvptx.md (movbi): Use P1 constraint for true.
(setcc_from_bi): Remove SImode specific pattern.
(setcc_from_bi): Provide more general HSDIM pattern.
(extendbi2, zeroextendbi2): Provide instructions
for sign- and zero-extending BImode predicates to integers.
(setcc_int): Remove previous (-1-based) instructions.
(cstorebi4): Remove BImode to SImode specific expander.
(cstore4): Fix indentation.  Expand using setccsi_from_bi.
(cstore4): For both integer and floating point modes.


Thanks in advance,
Roger
--

diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index d367174..fcebcf9 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -315,7 +315,7 @@ struct GTY(()) machine_function
 #define NO_DOT_IN_LABEL
 #define ASM_COMMENT_START "//"
 
-#define STORE_FLAG_VALUE -1
+#define STORE_FLAG_VALUE 1
 #define FLOAT_STORE_FLAG_VALUE(MODE) REAL_VALUE_ATOF("1.0", (MODE))
 
 #define CASE_VECTOR_MODE SImode
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 108de1c..b3275b1 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -213,7 +213,7 @@
 ;; get variables in this mode and pseudos are never spilled.
 (define_insn "movbi"
   [(set (match_operand:BI 0 "nvptx_register_operand" "=R,R,R")
-   (match_operand:BI 1 "nvptx_nonmemory_operand" "R,P0,Pn"))]
+   (match_operand:BI 1 "nvptx_nonmemory_operand" "R,P0,P1"))]
   ""
   "@
%.\\tmov%t0\\t%0, %1;
@@ -789,12 +789,26 @@
 
 ;; Conditional stores
 
-(define_insn "setcc_from_bi"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-   (ne:SI (match_operand:BI 1 "nvptx_register_operand" "R")
-  (const_int 0)))]
+(define_insn "setcc_from_bi"
+  [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R")
+   (ne:HSDIM (match_operand:BI 1 "nvptx_register_operand" "R")
+  (const_int 0)))]
+  ""
+  "%.\\tselp%t0\\t%0, 1, 0, %1;")
+
+(define_insn "extendbi2"
+  [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R")
+   (sign_extend:HSDIM
+(match_operand:BI 1 "nvptx_register_operand" "R")))]
+  ""
+  "%.\\tselp%t0\\t%0, -1, 0, %1;")
+
+(define_i