https://github.com/Bryce-MW updated https://github.com/llvm/llvm-project/pull/77964
>From d4c312b9dbf447d0a53dda0e6cdc482bd908430b Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Fri, 12 Jan 2024 16:01:32 -0600 Subject: [PATCH 01/16] [X86] Use RORX over SHR imm --- llvm/lib/Target/X86/X86InstrShiftRotate.td | 78 ++++++++++++++ llvm/test/CodeGen/X86/atomic-unordered.ll | 3 +- llvm/test/CodeGen/X86/bmi2.ll | 6 +- llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll | 3 +- llvm/test/CodeGen/X86/pr35636.ll | 4 +- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 116 ++++++++++----------- 6 files changed, 143 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index f951894db1890..238e8e9b6e97f 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -879,6 +879,26 @@ let Predicates = [HasBMI2, HasEGPR, In64BitMode] in { defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8, PD, REX_W, EVEX; } + +def immle16_8 : ImmLeaf<i8, [{ + return Imm <= 16 - 8; +}]>; +def immle32_8 : ImmLeaf<i8, [{ + return Imm <= 32 - 8; +}]>; +def immle64_8 : ImmLeaf<i8, [{ + return Imm <= 64 - 8; +}]>; +def immle32_16 : ImmLeaf<i8, [{ + return Imm <= 32 - 16; +}]>; +def immle64_16 : ImmLeaf<i8, [{ + return Imm <= 64 - 16; +}]>; +def immle64_32 : ImmLeaf<i8, [{ + return Imm <= 64 - 32; +}]>; + let Predicates = [HasBMI2] in { // Prefer RORX which is non-destructive and doesn't update EFLAGS. let AddedComplexity = 10 in { @@ -891,6 +911,64 @@ let Predicates = [HasBMI2] in { (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; + + // A right shift by less than a smaller register size that is then + // truncated to that register size can be replaced by RORX to + // preserve flags with the same execution cost + + def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16_8:$shamt)))), + (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16_8:$shamt)))), + (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32_8:$shamt)))), + (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32_8:$shamt)))), + (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64_8:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64_8:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; + + + def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32_16:$shamt)))), + (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32_16:$shamt)))), + (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64_16:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64_16:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; + + def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64_32:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; + def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64_32:$shamt)))), + (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; + + + // Can't expand the load + def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32_8:$shamt)))), + (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32_8:$shamt)))), + (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64_8:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; + def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64_8:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; + + + def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32_16:$shamt)))), + (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32_16:$shamt)))), + (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64_16:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; + def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64_16:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; + + def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64_32:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; + def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64_32:$shamt)))), + (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; } def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)), diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index df123be53474f..c867817fd2dff 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -2062,8 +2062,7 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-LABEL: split_load: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movq (%rdi), %rax -; CHECK-O3-NEXT: movq %rax, %rcx -; CHECK-O3-NEXT: shrq $32, %rcx +; CHECK-O3-NEXT: rorxq $32, %rax, %rcx ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index 24e38cfeb704d..e81434b35096a 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -310,8 +310,7 @@ define i32 @mulx32(i32 %x, i32 %y, ptr %p) { ; X64-NEXT: addl %edi, %edi ; X64-NEXT: leal (%rsi,%rsi), %eax ; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $32, %rcx +; X64-NEXT: rorxq $32, %rax, %rcx ; X64-NEXT: movl %ecx, (%rdx) ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq @@ -344,8 +343,7 @@ define i32 @mulx32_load(i32 %x, ptr %y, ptr %p) { ; X64-NEXT: leal (%rdi,%rdi), %eax ; X64-NEXT: movl (%rsi), %ecx ; X64-NEXT: imulq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $32, %rcx +; X64-NEXT: rorxq $32, %rax, %rcx ; X64-NEXT: movl %ecx, (%rdx) ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll index 7996454a0158e..a935bca3161b0 100644 --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -157,7 +157,8 @@ define i1 @shr_to_shl_eq_i16_s1_fail(i16 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: andl $32766, %edi # imm = 0x7FFE -; CHECK-NEXT: shrl %eax +; CHECK-NOBMI-NEXT: shrl %eax +; CHECK-BMI2-NEXT: rorxl $1, %eax, %eax ; CHECK-NEXT: cmpw %ax, %di ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll index 0b7d64f38c780..8b8a25eb5632c 100644 --- a/llvm/test/CodeGen/X86/pr35636.ll +++ b/llvm/test/CodeGen/X86/pr35636.ll @@ -10,7 +10,7 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) { ; HSW-NEXT: mulxq %rax, %rax, %rax ; HSW-NEXT: shrq $42, %rax ; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 -; HSW-NEXT: shrq $20, %rax +; HSW-NEXT: rorxq $20, %rax, %rax ; HSW-NEXT: leal (%rax,%rax,4), %eax ; HSW-NEXT: addl $5, %eax ; HSW-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF @@ -27,7 +27,7 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) { ; ZN-NEXT: mulxq %rax, %rax, %rax ; ZN-NEXT: shrq $42, %rax ; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 -; ZN-NEXT: shrq $20, %rax +; ZN-NEXT: rorxq $20, %rax, %rax ; ZN-NEXT: leal 5(%rax,%rax,4), %eax ; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF ; ZN-NEXT: leal (%rax,%rax,4), %eax diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 14f724fc3b8c7..2837be16b6b2b 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4843,72 +4843,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; SKX-NEXT: vpextrd $3, %xmm1, %r15d -; SKX-NEXT: movw %r15w, 45(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %r14d -; SKX-NEXT: movw %r14w, 42(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %ebp -; SKX-NEXT: movw %bp, 39(%rdi) +; SKX-NEXT: vpextrd $3, %xmm1, %r8d +; SKX-NEXT: movw %r8w, 45(%rdi) +; SKX-NEXT: vpextrd $2, %xmm1, %r9d +; SKX-NEXT: movw %r9w, 42(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r10d +; SKX-NEXT: movw %r10w, 39(%rdi) ; SKX-NEXT: vmovd %xmm1, %r11d ; SKX-NEXT: movw %r11w, 36(%rdi) ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; SKX-NEXT: vpextrd $3, %xmm1, %ebx ; SKX-NEXT: movw %bx, 33(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %r10d -; SKX-NEXT: movw %r10w, 30(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %r9d -; SKX-NEXT: movw %r9w, 27(%rdi) -; SKX-NEXT: vmovd %xmm1, %r8d -; SKX-NEXT: vpextrd $3, %xmm0, %edx -; SKX-NEXT: movw %r8w, 24(%rdi) -; SKX-NEXT: movw %dx, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %esi -; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: movw %si, 6(%rdi) -; SKX-NEXT: movw %ax, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %ecx -; SKX-NEXT: movw %cx, (%rdi) -; SKX-NEXT: shrl $16, %r15d -; SKX-NEXT: movb %r15b, 47(%rdi) -; SKX-NEXT: shrl $16, %r14d -; SKX-NEXT: movb %r14b, 44(%rdi) -; SKX-NEXT: shrl $16, %ebp -; SKX-NEXT: movb %bpl, 41(%rdi) -; SKX-NEXT: shrl $16, %r11d -; SKX-NEXT: movb %r11b, 38(%rdi) -; SKX-NEXT: shrl $16, %ebx -; SKX-NEXT: movb %bl, 35(%rdi) -; SKX-NEXT: shrl $16, %r10d -; SKX-NEXT: movb %r10b, 32(%rdi) -; SKX-NEXT: shrl $16, %r9d -; SKX-NEXT: movb %r9b, 29(%rdi) -; SKX-NEXT: shrl $16, %r8d +; SKX-NEXT: vpextrd $2, %xmm1, %ebp +; SKX-NEXT: movw %bp, 30(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r14d +; SKX-NEXT: movw %r14w, 27(%rdi) +; SKX-NEXT: vmovd %xmm1, %r15d +; SKX-NEXT: vpextrd $3, %xmm0, %eax +; SKX-NEXT: movw %r15w, 24(%rdi) +; SKX-NEXT: movw %ax, 9(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %ecx +; SKX-NEXT: vpextrd $1, %xmm0, %edx +; SKX-NEXT: movw %cx, 6(%rdi) +; SKX-NEXT: movw %dx, 3(%rdi) +; SKX-NEXT: vmovd %xmm0, %esi +; SKX-NEXT: movw %si, (%rdi) +; SKX-NEXT: rorxl $16, %r8d, %r8d +; SKX-NEXT: movb %r8b, 47(%rdi) +; SKX-NEXT: rorxl $16, %r9d, %r8d +; SKX-NEXT: movb %r8b, 44(%rdi) +; SKX-NEXT: rorxl $16, %r10d, %r8d +; SKX-NEXT: movb %r8b, 41(%rdi) +; SKX-NEXT: rorxl $16, %r11d, %r8d +; SKX-NEXT: movb %r8b, 38(%rdi) +; SKX-NEXT: rorxl $16, %ebx, %r8d +; SKX-NEXT: movb %r8b, 35(%rdi) +; SKX-NEXT: rorxl $16, %ebp, %r8d +; SKX-NEXT: movb %r8b, 32(%rdi) +; SKX-NEXT: rorxl $16, %r14d, %r8d +; SKX-NEXT: movb %r8b, 29(%rdi) +; SKX-NEXT: rorxl $16, %r15d, %r8d ; SKX-NEXT: movb %r8b, 26(%rdi) ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrd $3, %xmm0, %r11d -; SKX-NEXT: movw %r11w, 21(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %r10d -; SKX-NEXT: movw %r10w, 18(%rdi) -; SKX-NEXT: vpextrd $1, %xmm0, %r9d -; SKX-NEXT: movw %r9w, 15(%rdi) -; SKX-NEXT: vmovd %xmm0, %r8d -; SKX-NEXT: movw %r8w, 12(%rdi) -; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 11(%rdi) -; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 8(%rdi) -; SKX-NEXT: shrl $16, %eax +; SKX-NEXT: vpextrd $3, %xmm0, %r8d +; SKX-NEXT: movw %r8w, 21(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %r9d +; SKX-NEXT: movw %r9w, 18(%rdi) +; SKX-NEXT: vpextrd $1, %xmm0, %r10d +; SKX-NEXT: movw %r10w, 15(%rdi) +; SKX-NEXT: vmovd %xmm0, %r11d +; SKX-NEXT: movw %r11w, 12(%rdi) +; SKX-NEXT: rorxl $16, %eax, %eax +; SKX-NEXT: movb %al, 11(%rdi) +; SKX-NEXT: rorxl $16, %ecx, %eax +; SKX-NEXT: movb %al, 8(%rdi) +; SKX-NEXT: rorxl $16, %edx, %eax ; SKX-NEXT: movb %al, 5(%rdi) -; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 2(%rdi) -; SKX-NEXT: shrl $16, %r11d -; SKX-NEXT: movb %r11b, 23(%rdi) -; SKX-NEXT: shrl $16, %r10d -; SKX-NEXT: movb %r10b, 20(%rdi) -; SKX-NEXT: shrl $16, %r9d -; SKX-NEXT: movb %r9b, 17(%rdi) -; SKX-NEXT: shrl $16, %r8d -; SKX-NEXT: movb %r8b, 14(%rdi) +; SKX-NEXT: rorxl $16, %esi, %eax +; SKX-NEXT: movb %al, 2(%rdi) +; SKX-NEXT: rorxl $16, %r8d, %eax +; SKX-NEXT: movb %al, 23(%rdi) +; SKX-NEXT: rorxl $16, %r9d, %eax +; SKX-NEXT: movb %al, 20(%rdi) +; SKX-NEXT: rorxl $16, %r10d, %eax +; SKX-NEXT: movb %al, 17(%rdi) +; SKX-NEXT: rorxl $16, %r11d, %eax +; SKX-NEXT: movb %al, 14(%rdi) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r14 ; SKX-NEXT: popq %r15 >From 06342a51310156ce02296f81c44d42fc351ac4a4 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Fri, 12 Jan 2024 16:22:16 -0600 Subject: [PATCH 02/16] Change imm names --- llvm/lib/Target/X86/X86InstrShiftRotate.td | 56 +++++++++++----------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index 238e8e9b6e97f..e294182a247ad 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -880,22 +880,22 @@ let Predicates = [HasBMI2, HasEGPR, In64BitMode] in { } -def immle16_8 : ImmLeaf<i8, [{ +def immle16minus8 : ImmLeaf<i8, [{ return Imm <= 16 - 8; }]>; -def immle32_8 : ImmLeaf<i8, [{ +def immle32minus8 : ImmLeaf<i8, [{ return Imm <= 32 - 8; }]>; -def immle64_8 : ImmLeaf<i8, [{ +def immle64minus8 : ImmLeaf<i8, [{ return Imm <= 64 - 8; }]>; -def immle32_16 : ImmLeaf<i8, [{ +def immle32minus16 : ImmLeaf<i8, [{ return Imm <= 32 - 16; }]>; -def immle64_16 : ImmLeaf<i8, [{ +def immle64minus16 : ImmLeaf<i8, [{ return Imm <= 64 - 16; }]>; -def immle64_32 : ImmLeaf<i8, [{ +def immle64minus32 : ImmLeaf<i8, [{ return Imm <= 64 - 32; }]>; @@ -916,58 +916,58 @@ let Predicates = [HasBMI2] in { // truncated to that register size can be replaced by RORX to // preserve flags with the same execution cost - def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16_8:$shamt)))), + def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16minus8:$shamt)))), (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16_8:$shamt)))), + def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16minus8:$shamt)))), (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32_8:$shamt)))), + def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32minus8:$shamt)))), (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32_8:$shamt)))), + def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32minus8:$shamt)))), (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64_8:$shamt)))), + def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64minus8:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64_8:$shamt)))), + def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64minus8:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32_16:$shamt)))), + def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32minus16:$shamt)))), (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32_16:$shamt)))), + def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32minus16:$shamt)))), (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64_16:$shamt)))), + def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64minus16:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64_16:$shamt)))), + def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64minus16:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64_32:$shamt)))), + def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64minus32:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; - def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64_32:$shamt)))), + def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64minus32:$shamt)))), (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; // Can't expand the load - def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32_8:$shamt)))), + def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32_8:$shamt)))), + def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64_8:$shamt)))), + def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64_8:$shamt)))), + def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32_16:$shamt)))), + def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32_16:$shamt)))), + def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64_16:$shamt)))), + def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64_16:$shamt)))), + def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64_32:$shamt)))), + def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; - def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64_32:$shamt)))), + def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; } >From 777e5e2393f1c99865b1e0a4d1a734b11e8e00fb Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Fri, 12 Jan 2024 16:56:37 -0600 Subject: [PATCH 03/16] Add test --- llvm/test/CodeGen/X86/pr77964.ll | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr77964.ll diff --git a/llvm/test/CodeGen/X86/pr77964.ll b/llvm/test/CodeGen/X86/pr77964.ll new file mode 100644 index 0000000000000..4946d15fd8077 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr77964.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=NOBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefix=BMI + +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) +declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) + +define zeroext i16 @checksum(ptr %0) { +; NOBMI-LABEL: checksum: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movl (%rdi), %ecx +; NOBMI-NEXT: addl 4(%rdi), %ecx +; NOBMI-NEXT: adcl 8(%rdi), %ecx +; NOBMI-NEXT: adcl 12(%rdi), %ecx +; NOBMI-NEXT: adcl 16(%rdi), %ecx +; NOBMI-NEXT: setb %dl +; NOBMI-NEXT: movl %ecx, %eax +; NOBMI-NEXT: shrl $16, %eax +; NOBMI-NEXT: addb $255, %dl +; NOBMI-NEXT: adcw %cx, %ax +; NOBMI-NEXT: adcw $0, %ax +; NOBMI-NEXT: notl %eax +; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: checksum: +; BMI: # %bb.0: +; BMI-NEXT: movl (%rdi), %ecx +; BMI-NEXT: addl 4(%rdi), %ecx +; BMI-NEXT: adcl 8(%rdi), %ecx +; BMI-NEXT: adcl 12(%rdi), %ecx +; BMI-NEXT: adcl 16(%rdi), %ecx +; BMI-NEXT: rorxl $16, %ecx, %eax +; BMI-NEXT: adcw %cx, %ax +; BMI-NEXT: adcw $0, %ax +; BMI-NEXT: notl %eax +; BMI-NEXT: # kill: def $ax killed $ax killed $eax +; BMI-NEXT: retq + %2 = load i32, ptr %0 + %3 = getelementptr inbounds i32, ptr %0, i64 1 + %4 = load i32, ptr %3 + %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %2, i32 %4) + %6 = extractvalue { i32, i1 } %5, 1 + %7 = extractvalue { i32, i1 } %5, 0 + %8 = getelementptr inbounds i32, ptr %0, i64 2 + %9 = load i32, ptr %8 + %10 = zext i1 %6 to i32 + %11 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %7, i32 %9) + %12 = extractvalue { i32, i1 } %11, 1 + %13 = extractvalue { i32, i1 } %11, 0 + %14 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %13, i32 %10) + %15 = extractvalue { i32, i1 } %14, 1 + %16 = extractvalue { i32, i1 } %14, 0 + %17 = or i1 %12, %15 + %18 = getelementptr inbounds i32, ptr %0, i64 3 + %19 = load i32, ptr %18 + %20 = zext i1 %17 to i32 + %21 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %16, i32 %19) + %22 = extractvalue { i32, i1 } %21, 1 + %23 = extractvalue { i32, i1 } %21, 0 + %24 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %23, i32 %20) + %25 = extractvalue { i32, i1 } %24, 1 + %26 = extractvalue { i32, i1 } %24, 0 + %27 = or i1 %22, %25 + %28 = getelementptr inbounds i32, ptr %0, i64 4 + %29 = load i32, ptr %28 + %30 = zext i1 %27 to i32 + %31 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %26, i32 %29) + %32 = extractvalue { i32, i1 } %31, 1 + %33 = extractvalue { i32, i1 } %31, 0 + %34 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %33, i32 %30) + %35 = extractvalue { i32, i1 } %34, 1 + %36 = extractvalue { i32, i1 } %34, 0 + %37 = or i1 %32, %35 + %38 = zext i1 %37 to i16 + %39 = trunc i32 %36 to i16 + %40 = lshr i32 %36, 16 + %41 = trunc i32 %40 to i16 + %42 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %39, i16 %41) + %43 = extractvalue { i16, i1 } %42, 1 + %44 = extractvalue { i16, i1 } %42, 0 + %45 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %44, i16 %38) + %46 = extractvalue { i16, i1 } %45, 1 + %47 = extractvalue { i16, i1 } %45, 0 + %48 = or i1 %43, %46 + %49 = zext i1 %48 to i16 + %50 = add i16 %47, %49 + %51 = xor i16 %50, -1 + ret i16 %51 +} >From bef5eaa2518594bc44bd2757d5e4d9527f5e7b8c Mon Sep 17 00:00:00 2001 From: Bryce Wilson <br...@brycemw.ca> Date: Sat, 13 Jan 2024 15:26:53 -0600 Subject: [PATCH 04/16] Update test correctly --- llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll index a935bca3161b0..c74cae5fb3452 100644 --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -153,15 +153,23 @@ define i1 @shr_to_shl_eq_i16_s1(i16 %x) { } define i1 @shr_to_shl_eq_i16_s1_fail(i16 %x) { -; CHECK-LABEL: shr_to_shl_eq_i16_s1_fail: -; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %di, %eax -; CHECK-NEXT: andl $32766, %edi # imm = 0x7FFE +; CHECK-NOBMI-LABEL: shr_to_shl_eq_i16_s1_fail: +; CHECK-NOBMI: # %bb.0: +; CHECK-NOBMI-NEXT: movzwl %di, %eax +; CHECK-NOBMI-NEXT: andl $32766, %edi # imm = 0x7FFE ; CHECK-NOBMI-NEXT: shrl %eax +; CHECK-NOBMI-NEXT: cmpw %ax, %di +; CHECK-NOBMI-NEXT: sete %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: shr_to_shl_eq_i16_s1_fail: +; CHECK-BMI2: # %bb.0: +; CHECK-BMI2-NEXT: movzwl %di, %eax +; CHECK-BMI2-NEXT: andl $32766, %edi # imm = 0x7FFE ; CHECK-BMI2-NEXT: rorxl $1, %eax, %eax -; CHECK-NEXT: cmpw %ax, %di -; CHECK-NEXT: sete %al -; CHECK-NEXT: retq +; CHECK-BMI2-NEXT: cmpw %ax, %di +; CHECK-BMI2-NEXT: sete %al +; CHECK-BMI2-NEXT: retq %and = and i16 %x, 32766 %shr = lshr i16 %x, 1 %r = icmp eq i16 %and, %shr >From d65d1854c71d7a7b99b6efd403dfc955c70a0741 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <br...@brycemw.ca> Date: Sat, 13 Jan 2024 15:37:49 -0600 Subject: [PATCH 05/16] Add another test with memory --- llvm/test/CodeGen/X86/pr77964.ll | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llvm/test/CodeGen/X86/pr77964.ll b/llvm/test/CodeGen/X86/pr77964.ll index 4946d15fd8077..df20e204790ee 100644 --- a/llvm/test/CodeGen/X86/pr77964.ll +++ b/llvm/test/CodeGen/X86/pr77964.ll @@ -88,3 +88,39 @@ define zeroext i16 @checksum(ptr %0) { %51 = xor i16 %50, -1 ret i16 %51 } + +; This is expected to just load the byte and not use rorx +define i8 @extract_aligned_byte(ptr %0) { +; NOBMI-LABEL: extract_aligned_byte: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movzbl 6(%rdi), %eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: extract_aligned_byte: +; BMI: # %bb.0: +; BMI-NEXT: movzbl 6(%rdi), %eax +; BMI-NEXT: retq + %2 = load i64, ptr %0 + %3 = lshr i64 %2, 48 + %4 = trunc i64 %3 to i8 + ret i8 %4 +} + +define i8 @extract_unaligned_byte(ptr %0) { +; NOBMI-LABEL: extract_unaligned_byte: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movq (%rdi), %rax +; NOBMI-NEXT: shrq $52, %rax +; NOBMI-NEXT: # kill: def $al killed $al killed $rax +; NOBMI-NEXT: retq +; +; BMI-LABEL: extract_unaligned_byte: +; BMI: # %bb.0: +; BMI-NEXT: rorxq $52, (%rdi), %rax +; BMI-NEXT: # kill: def $al killed $al killed $rax +; BMI-NEXT: retq + %2 = load i64, ptr %0 + %3 = lshr i64 %2, 52 + %4 = trunc i64 %3 to i8 + ret i8 %4 +} >From 9a2c538f106ab524b9ad5ca5ce0d202da5924713 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <br...@brycemw.ca> Date: Sat, 13 Jan 2024 20:11:35 -0600 Subject: [PATCH 06/16] Fix nits Signed-off-by: Bryce Wilson <br...@brycemw.ca> --- llvm/lib/Target/X86/X86InstrShiftRotate.td | 2 +- llvm/test/CodeGen/X86/pr77964.ll | 19 +++++++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index e294182a247ad..4c292ff266610 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -914,7 +914,7 @@ let Predicates = [HasBMI2] in { // A right shift by less than a smaller register size that is then // truncated to that register size can be replaced by RORX to - // preserve flags with the same execution cost + // preserve flags with the same execution cost. def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16minus8:$shamt)))), (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; diff --git a/llvm/test/CodeGen/X86/pr77964.ll b/llvm/test/CodeGen/X86/pr77964.ll index df20e204790ee..e58e110e8bb20 100644 --- a/llvm/test/CodeGen/X86/pr77964.ll +++ b/llvm/test/CodeGen/X86/pr77964.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=NOBMI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefix=BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,NOBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s --check-prefixes=CHECK,BMI declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) @@ -89,17 +89,12 @@ define zeroext i16 @checksum(ptr %0) { ret i16 %51 } -; This is expected to just load the byte and not use rorx +; This is expected to just load the byte and not use rorx. define i8 @extract_aligned_byte(ptr %0) { -; NOBMI-LABEL: extract_aligned_byte: -; NOBMI: # %bb.0: -; NOBMI-NEXT: movzbl 6(%rdi), %eax -; NOBMI-NEXT: retq -; -; BMI-LABEL: extract_aligned_byte: -; BMI: # %bb.0: -; BMI-NEXT: movzbl 6(%rdi), %eax -; BMI-NEXT: retq +; CHECK-LABEL: extract_aligned_byte: +; CHECK: # %bb.0: +; CHECK-NEXT: movzbl 6(%rdi), %eax +; CHECK-NEXT: retq %2 = load i64, ptr %0 %3 = lshr i64 %2, 48 %4 = trunc i64 %3 to i8 >From 8a4753177d47dbac3ce6a679065d7a11343a69c5 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <br...@brycemw.ca> Date: Thu, 18 Jan 2024 07:54:54 -0600 Subject: [PATCH 07/16] Move conversion to a different place (profitability check not in yet) --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 83 +++++++++++++++++ llvm/lib/Target/X86/X86InstrShiftRotate.td | 100 ++++++++++----------- 2 files changed, 133 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 53ce720be2da4..09724363e165f 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -554,6 +554,7 @@ namespace { bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; + bool rightShiftUncloberFlags(SDNode *N); bool tryShiftAmountMod(SDNode *N); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTERNLOG(SDNode *N); @@ -4208,6 +4209,84 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, return CNode; } +// When the consumer of a right shift (arithmetic or logical) wouldn't +// notice the difference if the instruction was a rotate right instead +// (because the bits shifted in are truncated away), the shift can be +// replaced by the RORX instruction from BMI2. This doesn't set flags and +// can output to a different register. This increases code size in most +// cases, can have a false dependency when promoting the operand size, +// and doesn't leave the high bits in a useful state. There may be other +// situations where this transformation is profitable given those +// conditions, but currently the transformation is only made when it +// avoids spilling flags. +bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { + EVT VT = N->getValueType(0); + + printf("Evaluating\n"); + + // Target has to have BMI2 for RORX + if (!Subtarget->hasBMI2()) + return false; + + printf("Has BMI2\n"); + + // Only handle scalar shifts. + if (VT.isVector()) + return false; + + printf("Not vector\n"); + + unsigned OpSize; + if (VT == MVT::i64) OpSize = 64; + else if (VT == MVT::i32) OpSize = 32; + else if (VT == MVT::i16) OpSize = 16; + else if (VT == MVT::i8) return false; // i8 shift can't be truncated. + else llvm_unreachable("Unexpected shift size"); + + printf("Good OpSize\n"); + + unsigned TruncateSize = 0; + // This only works when the result is truncated. + for (const SDNode *User : N->uses()) { + //printf("Looking at a use. TargetOpcode is %u\n", User->isTargetOpcode()); + auto name = User->getOperationName(CurDAG); + printf("Looking at a thing %s %u %u\n", name.c_str(), User->getOpcode(), TargetOpcode::EXTRACT_SUBREG); + if (User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return false; + printf("It's an EXTRACT_SUBREG\n"); + EVT TuncateType = User->getValueType(0); + if (TuncateType == MVT::i32) TruncateSize = std::max(TruncateSize, 32U); + else if (TuncateType == MVT::i16) TruncateSize = std::max(TruncateSize, 16U); + else if (TuncateType == MVT::i8) TruncateSize = std::max(TruncateSize, 8U); + else return false; + } + printf("Truncates are fine\n"); + if (TruncateSize >= OpSize) + return false; + printf("Truncate size works\n"); + + // The shift must be by an immediate that wouldn't expose the zero or sign + // extended result. + auto *ShiftAmount = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!ShiftAmount || ShiftAmount->getZExtValue() > OpSize - TruncateSize) + return false; + printf("Shift amount is good\n"); + + // Only make the replacement when it avoids clobbering used flags. If it is + // determined to be profitable in other situations in the future, add those + // checks here. + + // Make the replacement. + SDLoc DL(N); + MVT RotateSize = OpSize == 64 ? MVT::i64 : MVT::i32; + SDNode* Replacement = CurDAG->getNode(ISD::ROTR, DL, RotateSize, N->getOperand(0), N->getOperand(1)).getNode(); + ReplaceNode(N, Replacement); + CurDAG->RemoveDeadNode(N); + SelectCode(Replacement); + printf("Replacement made\n"); + return true; +} + bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { EVT VT = N->getValueType(0); @@ -5227,6 +5306,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; [[fallthrough]]; case ISD::SRA: + printf("Going to evaluate SRL or SRA"); + if (rightShiftUncloberFlags(Node)) + return; + [[fallthrough]]; case ISD::SHL: if (tryShiftAmountMod(Node)) return; diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index 4c292ff266610..42bda8e878d00 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -916,59 +916,59 @@ let Predicates = [HasBMI2] in { // truncated to that register size can be replaced by RORX to // preserve flags with the same execution cost. - def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64minus8:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64minus8:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; - - - def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32minus16:$shamt)))), - (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32minus16:$shamt)))), - (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64minus16:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64minus16:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; - - def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64minus32:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; - def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64minus32:$shamt)))), - (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; +// def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; + + +// def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; + +// def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64minus32:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; +// def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64minus32:$shamt)))), +// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; // Can't expand the load - def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), - (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; - def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; - - - def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), - (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), - (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; - def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; - - def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; - def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), - (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; +// def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; +// def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; + + +// def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; +// def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; + +// def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; +// def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), +// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; } def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)), >From 78eeeab5ef31bd51053d294ae8ef64e0af6cd085 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Thu, 18 Jan 2024 08:27:41 -0600 Subject: [PATCH 08/16] Fix clang-format, remove test prints --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 40 ++++++----- llvm/lib/Target/X86/X86InstrShiftRotate.td | 78 ---------------------- 2 files changed, 22 insertions(+), 96 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 09724363e165f..406dadc29b389 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4237,40 +4237,43 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { printf("Not vector\n"); unsigned OpSize; - if (VT == MVT::i64) OpSize = 64; - else if (VT == MVT::i32) OpSize = 32; - else if (VT == MVT::i16) OpSize = 16; - else if (VT == MVT::i8) return false; // i8 shift can't be truncated. - else llvm_unreachable("Unexpected shift size"); + if (VT == MVT::i64) + OpSize = 64; + else if (VT == MVT::i32) + OpSize = 32; + else if (VT == MVT::i16) + OpSize = 16; + else if (VT == MVT::i8) + return false; // i8 shift can't be truncated. + else + llvm_unreachable("Unexpected shift size"); printf("Good OpSize\n"); unsigned TruncateSize = 0; // This only works when the result is truncated. for (const SDNode *User : N->uses()) { - //printf("Looking at a use. TargetOpcode is %u\n", User->isTargetOpcode()); auto name = User->getOperationName(CurDAG); - printf("Looking at a thing %s %u %u\n", name.c_str(), User->getOpcode(), TargetOpcode::EXTRACT_SUBREG); if (User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return false; - printf("It's an EXTRACT_SUBREG\n"); EVT TuncateType = User->getValueType(0); - if (TuncateType == MVT::i32) TruncateSize = std::max(TruncateSize, 32U); - else if (TuncateType == MVT::i16) TruncateSize = std::max(TruncateSize, 16U); - else if (TuncateType == MVT::i8) TruncateSize = std::max(TruncateSize, 8U); - else return false; + if (TuncateType == MVT::i32) + TruncateSize = std::max(TruncateSize, 32U); + else if (TuncateType == MVT::i16) + TruncateSize = std::max(TruncateSize, 16U); + else if (TuncateType == MVT::i8) + TruncateSize = std::max(TruncateSize, 8U); + else + return false; } - printf("Truncates are fine\n"); if (TruncateSize >= OpSize) return false; - printf("Truncate size works\n"); // The shift must be by an immediate that wouldn't expose the zero or sign // extended result. auto *ShiftAmount = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!ShiftAmount || ShiftAmount->getZExtValue() > OpSize - TruncateSize) return false; - printf("Shift amount is good\n"); // Only make the replacement when it avoids clobbering used flags. If it is // determined to be profitable in other situations in the future, add those @@ -4279,11 +4282,13 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { // Make the replacement. SDLoc DL(N); MVT RotateSize = OpSize == 64 ? MVT::i64 : MVT::i32; - SDNode* Replacement = CurDAG->getNode(ISD::ROTR, DL, RotateSize, N->getOperand(0), N->getOperand(1)).getNode(); + SDNode *Replacement = CurDAG + ->getNode(ISD::ROTR, DL, RotateSize, + N->getOperand(0), N->getOperand(1)) + .getNode(); ReplaceNode(N, Replacement); CurDAG->RemoveDeadNode(N); SelectCode(Replacement); - printf("Replacement made\n"); return true; } @@ -5306,7 +5311,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; [[fallthrough]]; case ISD::SRA: - printf("Going to evaluate SRL or SRA"); if (rightShiftUncloberFlags(Node)) return; [[fallthrough]]; diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index 42bda8e878d00..f951894db1890 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -879,26 +879,6 @@ let Predicates = [HasBMI2, HasEGPR, In64BitMode] in { defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8, PD, REX_W, EVEX; } - -def immle16minus8 : ImmLeaf<i8, [{ - return Imm <= 16 - 8; -}]>; -def immle32minus8 : ImmLeaf<i8, [{ - return Imm <= 32 - 8; -}]>; -def immle64minus8 : ImmLeaf<i8, [{ - return Imm <= 64 - 8; -}]>; -def immle32minus16 : ImmLeaf<i8, [{ - return Imm <= 32 - 16; -}]>; -def immle64minus16 : ImmLeaf<i8, [{ - return Imm <= 64 - 16; -}]>; -def immle64minus32 : ImmLeaf<i8, [{ - return Imm <= 64 - 32; -}]>; - let Predicates = [HasBMI2] in { // Prefer RORX which is non-destructive and doesn't update EFLAGS. let AddedComplexity = 10 in { @@ -911,64 +891,6 @@ let Predicates = [HasBMI2] in { (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; - - // A right shift by less than a smaller register size that is then - // truncated to that register size can be replaced by RORX to - // preserve flags with the same execution cost. - -// def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_8bit)>; - - -// def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_16bit)>; - -// def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64minus32:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; -// def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64minus32:$shamt)))), -// (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>; - - - // Can't expand the load -// def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; -// def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64minus8:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_8bit)>; - - -// def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; -// def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64minus16:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_16bit)>; - -// def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; -// def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64minus32:$shamt)))), -// (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>; } def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)), >From 43a0910fc3afbcccd270cd55aa390a47be358799 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Tue, 23 Jan 2024 11:47:34 -0600 Subject: [PATCH 09/16] Add proper profitability check --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 48 +++++---- llvm/test/CodeGen/X86/atomic-unordered.ll | 3 +- llvm/test/CodeGen/X86/bmi2.ll | 6 +- llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll | 25 ++--- llvm/test/CodeGen/X86/pr35636.ll | 4 +- llvm/test/CodeGen/X86/pr77964.ll | 18 ++-- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 116 ++++++++++----------- 7 files changed, 107 insertions(+), 113 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 73085dd6b82b5..d6b6ce653f8ed 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4213,33 +4213,25 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, return CNode; } -// When the consumer of a right shift (arithmetic or logical) wouldn't -// notice the difference if the instruction was a rotate right instead -// (because the bits shifted in are truncated away), the shift can be -// replaced by the RORX instruction from BMI2. This doesn't set flags and -// can output to a different register. This increases code size in most -// cases, can have a false dependency when promoting the operand size, -// and doesn't leave the high bits in a useful state. There may be other -// situations where this transformation is profitable given those -// conditions, but currently the transformation is only made when it -// avoids spilling flags. +// When the consumer of a right shift (arithmetic or logical) wouldn't notice +// the difference if the instruction was a rotate right instead (because the +// bits shifted in are truncated away), the shift can be replaced by the RORX +// instruction from BMI2. This doesn't set flags and can output to a different +// register. However, this increases code size in most cases, and doesn't leave +// the high bits in a useful state. There may be other situations where this +// transformation is profitable given those conditions, but currently the +// transformation is only made when it likely avoids spilling flags. bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { EVT VT = N->getValueType(0); - printf("Evaluating\n"); - // Target has to have BMI2 for RORX if (!Subtarget->hasBMI2()) return false; - printf("Has BMI2\n"); - // Only handle scalar shifts. if (VT.isVector()) return false; - printf("Not vector\n"); - unsigned OpSize; if (VT == MVT::i64) OpSize = 64; @@ -4252,8 +4244,6 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { else llvm_unreachable("Unexpected shift size"); - printf("Good OpSize\n"); - unsigned TruncateSize = 0; // This only works when the result is truncated. for (const SDNode *User : N->uses()) { @@ -4279,9 +4269,25 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { if (!ShiftAmount || ShiftAmount->getZExtValue() > OpSize - TruncateSize) return false; - // Only make the replacement when it avoids clobbering used flags. If it is - // determined to be profitable in other situations in the future, add those - // checks here. + // Only make the replacement when it avoids clobbering used flags. This is a + // similar heuristic as used in the conversion to LEA, namely looking at the + // operand for an instruction that creates flags where those flags are used. + // This will have both false positives and false negatives. Ideally, both of + // these happen later on. Perhaps in copy to flags lowering or in register + // allocation. + bool MightClobberFlags = false; + SDNode *Input = N->getOperand(0).getNode(); + for (auto Use : Input->uses()) { + if (Use->getOpcode() == ISD::CopyToReg) { + auto RegisterNode = dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); + if (RegisterNode && RegisterNode->getReg() == X86::EFLAGS) { + MightClobberFlags = true; + break; + } + } + } + if (!MightClobberFlags) + return false; // Make the replacement. SDLoc DL(N); diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index c867817fd2dff..df123be53474f 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -2062,7 +2062,8 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-LABEL: split_load: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movq (%rdi), %rax -; CHECK-O3-NEXT: rorxq $32, %rax, %rcx +; CHECK-O3-NEXT: movq %rax, %rcx +; CHECK-O3-NEXT: shrq $32, %rcx ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index be37b62a2e236..cabeebb0c3f36 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -406,7 +406,8 @@ define i32 @mulx32(i32 %x, i32 %y, ptr %p) { ; X64-NEXT: addl %edi, %edi ; X64-NEXT: leal (%rsi,%rsi), %eax ; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: rorxq $32, %rax, %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $32, %rcx ; X64-NEXT: movl %ecx, (%rdx) ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq @@ -452,7 +453,8 @@ define i32 @mulx32_load(i32 %x, ptr %y, ptr %p) { ; X64-NEXT: leal (%rdi,%rdi), %eax ; X64-NEXT: movl (%rsi), %ecx ; X64-NEXT: imulq %rcx, %rax -; X64-NEXT: rorxq $32, %rax, %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $32, %rcx ; X64-NEXT: movl %ecx, (%rdx) ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll index c74cae5fb3452..7996454a0158e 100644 --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -153,23 +153,14 @@ define i1 @shr_to_shl_eq_i16_s1(i16 %x) { } define i1 @shr_to_shl_eq_i16_s1_fail(i16 %x) { -; CHECK-NOBMI-LABEL: shr_to_shl_eq_i16_s1_fail: -; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movzwl %di, %eax -; CHECK-NOBMI-NEXT: andl $32766, %edi # imm = 0x7FFE -; CHECK-NOBMI-NEXT: shrl %eax -; CHECK-NOBMI-NEXT: cmpw %ax, %di -; CHECK-NOBMI-NEXT: sete %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: shr_to_shl_eq_i16_s1_fail: -; CHECK-BMI2: # %bb.0: -; CHECK-BMI2-NEXT: movzwl %di, %eax -; CHECK-BMI2-NEXT: andl $32766, %edi # imm = 0x7FFE -; CHECK-BMI2-NEXT: rorxl $1, %eax, %eax -; CHECK-BMI2-NEXT: cmpw %ax, %di -; CHECK-BMI2-NEXT: sete %al -; CHECK-BMI2-NEXT: retq +; CHECK-LABEL: shr_to_shl_eq_i16_s1_fail: +; CHECK: # %bb.0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: andl $32766, %edi # imm = 0x7FFE +; CHECK-NEXT: shrl %eax +; CHECK-NEXT: cmpw %ax, %di +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %and = and i16 %x, 32766 %shr = lshr i16 %x, 1 %r = icmp eq i16 %and, %shr diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll index 8b8a25eb5632c..0b7d64f38c780 100644 --- a/llvm/test/CodeGen/X86/pr35636.ll +++ b/llvm/test/CodeGen/X86/pr35636.ll @@ -10,7 +10,7 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) { ; HSW-NEXT: mulxq %rax, %rax, %rax ; HSW-NEXT: shrq $42, %rax ; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 -; HSW-NEXT: rorxq $20, %rax, %rax +; HSW-NEXT: shrq $20, %rax ; HSW-NEXT: leal (%rax,%rax,4), %eax ; HSW-NEXT: addl $5, %eax ; HSW-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF @@ -27,7 +27,7 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) { ; ZN-NEXT: mulxq %rax, %rax, %rax ; ZN-NEXT: shrq $42, %rax ; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 -; ZN-NEXT: rorxq $20, %rax, %rax +; ZN-NEXT: shrq $20, %rax ; ZN-NEXT: leal 5(%rax,%rax,4), %eax ; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF ; ZN-NEXT: leal (%rax,%rax,4), %eax diff --git a/llvm/test/CodeGen/X86/pr77964.ll b/llvm/test/CodeGen/X86/pr77964.ll index e58e110e8bb20..f06f54c38f003 100644 --- a/llvm/test/CodeGen/X86/pr77964.ll +++ b/llvm/test/CodeGen/X86/pr77964.ll @@ -102,18 +102,12 @@ define i8 @extract_aligned_byte(ptr %0) { } define i8 @extract_unaligned_byte(ptr %0) { -; NOBMI-LABEL: extract_unaligned_byte: -; NOBMI: # %bb.0: -; NOBMI-NEXT: movq (%rdi), %rax -; NOBMI-NEXT: shrq $52, %rax -; NOBMI-NEXT: # kill: def $al killed $al killed $rax -; NOBMI-NEXT: retq -; -; BMI-LABEL: extract_unaligned_byte: -; BMI: # %bb.0: -; BMI-NEXT: rorxq $52, (%rdi), %rax -; BMI-NEXT: # kill: def $al killed $al killed $rax -; BMI-NEXT: retq +; CHECK-LABEL: extract_unaligned_byte: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: shrq $52, %rax +; CHECK-NEXT: # kill: def $al killed $al killed $rax +; CHECK-NEXT: retq %2 = load i64, ptr %0 %3 = lshr i64 %2, 52 %4 = trunc i64 %3 to i8 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 2837be16b6b2b..14f724fc3b8c7 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4843,72 +4843,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; SKX-NEXT: vpextrd $3, %xmm1, %r8d -; SKX-NEXT: movw %r8w, 45(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %r9d -; SKX-NEXT: movw %r9w, 42(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %r10d -; SKX-NEXT: movw %r10w, 39(%rdi) +; SKX-NEXT: vpextrd $3, %xmm1, %r15d +; SKX-NEXT: movw %r15w, 45(%rdi) +; SKX-NEXT: vpextrd $2, %xmm1, %r14d +; SKX-NEXT: movw %r14w, 42(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %ebp +; SKX-NEXT: movw %bp, 39(%rdi) ; SKX-NEXT: vmovd %xmm1, %r11d ; SKX-NEXT: movw %r11w, 36(%rdi) ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; SKX-NEXT: vpextrd $3, %xmm1, %ebx ; SKX-NEXT: movw %bx, 33(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %ebp -; SKX-NEXT: movw %bp, 30(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %r14d -; SKX-NEXT: movw %r14w, 27(%rdi) -; SKX-NEXT: vmovd %xmm1, %r15d -; SKX-NEXT: vpextrd $3, %xmm0, %eax -; SKX-NEXT: movw %r15w, 24(%rdi) -; SKX-NEXT: movw %ax, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %ecx -; SKX-NEXT: vpextrd $1, %xmm0, %edx -; SKX-NEXT: movw %cx, 6(%rdi) -; SKX-NEXT: movw %dx, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %esi -; SKX-NEXT: movw %si, (%rdi) -; SKX-NEXT: rorxl $16, %r8d, %r8d -; SKX-NEXT: movb %r8b, 47(%rdi) -; SKX-NEXT: rorxl $16, %r9d, %r8d -; SKX-NEXT: movb %r8b, 44(%rdi) -; SKX-NEXT: rorxl $16, %r10d, %r8d -; SKX-NEXT: movb %r8b, 41(%rdi) -; SKX-NEXT: rorxl $16, %r11d, %r8d -; SKX-NEXT: movb %r8b, 38(%rdi) -; SKX-NEXT: rorxl $16, %ebx, %r8d -; SKX-NEXT: movb %r8b, 35(%rdi) -; SKX-NEXT: rorxl $16, %ebp, %r8d -; SKX-NEXT: movb %r8b, 32(%rdi) -; SKX-NEXT: rorxl $16, %r14d, %r8d -; SKX-NEXT: movb %r8b, 29(%rdi) -; SKX-NEXT: rorxl $16, %r15d, %r8d +; SKX-NEXT: vpextrd $2, %xmm1, %r10d +; SKX-NEXT: movw %r10w, 30(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r9d +; SKX-NEXT: movw %r9w, 27(%rdi) +; SKX-NEXT: vmovd %xmm1, %r8d +; SKX-NEXT: vpextrd $3, %xmm0, %edx +; SKX-NEXT: movw %r8w, 24(%rdi) +; SKX-NEXT: movw %dx, 9(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %esi +; SKX-NEXT: vpextrd $1, %xmm0, %eax +; SKX-NEXT: movw %si, 6(%rdi) +; SKX-NEXT: movw %ax, 3(%rdi) +; SKX-NEXT: vmovd %xmm0, %ecx +; SKX-NEXT: movw %cx, (%rdi) +; SKX-NEXT: shrl $16, %r15d +; SKX-NEXT: movb %r15b, 47(%rdi) +; SKX-NEXT: shrl $16, %r14d +; SKX-NEXT: movb %r14b, 44(%rdi) +; SKX-NEXT: shrl $16, %ebp +; SKX-NEXT: movb %bpl, 41(%rdi) +; SKX-NEXT: shrl $16, %r11d +; SKX-NEXT: movb %r11b, 38(%rdi) +; SKX-NEXT: shrl $16, %ebx +; SKX-NEXT: movb %bl, 35(%rdi) +; SKX-NEXT: shrl $16, %r10d +; SKX-NEXT: movb %r10b, 32(%rdi) +; SKX-NEXT: shrl $16, %r9d +; SKX-NEXT: movb %r9b, 29(%rdi) +; SKX-NEXT: shrl $16, %r8d ; SKX-NEXT: movb %r8b, 26(%rdi) ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpextrd $3, %xmm0, %r8d -; SKX-NEXT: movw %r8w, 21(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %r9d -; SKX-NEXT: movw %r9w, 18(%rdi) -; SKX-NEXT: vpextrd $1, %xmm0, %r10d -; SKX-NEXT: movw %r10w, 15(%rdi) -; SKX-NEXT: vmovd %xmm0, %r11d -; SKX-NEXT: movw %r11w, 12(%rdi) -; SKX-NEXT: rorxl $16, %eax, %eax -; SKX-NEXT: movb %al, 11(%rdi) -; SKX-NEXT: rorxl $16, %ecx, %eax -; SKX-NEXT: movb %al, 8(%rdi) -; SKX-NEXT: rorxl $16, %edx, %eax +; SKX-NEXT: vpextrd $3, %xmm0, %r11d +; SKX-NEXT: movw %r11w, 21(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %r10d +; SKX-NEXT: movw %r10w, 18(%rdi) +; SKX-NEXT: vpextrd $1, %xmm0, %r9d +; SKX-NEXT: movw %r9w, 15(%rdi) +; SKX-NEXT: vmovd %xmm0, %r8d +; SKX-NEXT: movw %r8w, 12(%rdi) +; SKX-NEXT: shrl $16, %edx +; SKX-NEXT: movb %dl, 11(%rdi) +; SKX-NEXT: shrl $16, %esi +; SKX-NEXT: movb %sil, 8(%rdi) +; SKX-NEXT: shrl $16, %eax ; SKX-NEXT: movb %al, 5(%rdi) -; SKX-NEXT: rorxl $16, %esi, %eax -; SKX-NEXT: movb %al, 2(%rdi) -; SKX-NEXT: rorxl $16, %r8d, %eax -; SKX-NEXT: movb %al, 23(%rdi) -; SKX-NEXT: rorxl $16, %r9d, %eax -; SKX-NEXT: movb %al, 20(%rdi) -; SKX-NEXT: rorxl $16, %r10d, %eax -; SKX-NEXT: movb %al, 17(%rdi) -; SKX-NEXT: rorxl $16, %r11d, %eax -; SKX-NEXT: movb %al, 14(%rdi) +; SKX-NEXT: shrl $16, %ecx +; SKX-NEXT: movb %cl, 2(%rdi) +; SKX-NEXT: shrl $16, %r11d +; SKX-NEXT: movb %r11b, 23(%rdi) +; SKX-NEXT: shrl $16, %r10d +; SKX-NEXT: movb %r10b, 20(%rdi) +; SKX-NEXT: shrl $16, %r9d +; SKX-NEXT: movb %r9b, 17(%rdi) +; SKX-NEXT: shrl $16, %r8d +; SKX-NEXT: movb %r8b, 14(%rdi) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r14 ; SKX-NEXT: popq %r15 >From b14b08abba26d93726dec892ea64ed25d1ab217a Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Tue, 23 Jan 2024 11:57:18 -0600 Subject: [PATCH 10/16] clang-format --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d6b6ce653f8ed..5d90e748a6da6 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4279,7 +4279,8 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { SDNode *Input = N->getOperand(0).getNode(); for (auto Use : Input->uses()) { if (Use->getOpcode() == ISD::CopyToReg) { - auto RegisterNode = dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); + auto RegisterNode = + dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); if (RegisterNode && RegisterNode->getReg() == X86::EFLAGS) { MightClobberFlags = true; break; >From 4f4dd7d3d14d557dc6bfb76b1f1388ebc94b2726 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Tue, 23 Jan 2024 12:19:26 -0600 Subject: [PATCH 11/16] Add pointer --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5d90e748a6da6..63a7cf2efea04 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4279,7 +4279,7 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { SDNode *Input = N->getOperand(0).getNode(); for (auto Use : Input->uses()) { if (Use->getOpcode() == ISD::CopyToReg) { - auto RegisterNode = + auto *RegisterNode = dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); if (RegisterNode && RegisterNode->getReg() == X86::EFLAGS) { MightClobberFlags = true; >From a3ad2b5d9e7da4580f0b29ca313989978d605b5f Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Wed, 24 Jan 2024 16:24:02 -0600 Subject: [PATCH 12/16] Fix bug --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index fab2e131429da..99a1470c60fba 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4252,7 +4252,7 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { // This only works when the result is truncated. for (const SDNode *User : N->uses()) { auto name = User->getOperationName(CurDAG); - if (User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + if (!User.isMachineOpcode() || User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return false; EVT TuncateType = User->getValueType(0); if (TuncateType == MVT::i32) >From afaff4ad61723e0dee1d9947068fff0b5bdc3fb4 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Wed, 24 Jan 2024 16:27:26 -0600 Subject: [PATCH 13/16] clang-format --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 99a1470c60fba..17a57f70b5354 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4252,7 +4252,8 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { // This only works when the result is truncated. for (const SDNode *User : N->uses()) { auto name = User->getOperationName(CurDAG); - if (!User.isMachineOpcode() || User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + if (!User.isMachineOpcode() || + User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return false; EVT TuncateType = User->getValueType(0); if (TuncateType == MVT::i32) >From 349af52eba94bd69b298ed5b86a0ff1ef493a1fd Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Wed, 24 Jan 2024 16:43:28 -0600 Subject: [PATCH 14/16] typo --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 17a57f70b5354..f9d4edc7a8464 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4252,7 +4252,7 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { // This only works when the result is truncated. for (const SDNode *User : N->uses()) { auto name = User->getOperationName(CurDAG); - if (!User.isMachineOpcode() || + if (!User->isMachineOpcode() || User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return false; EVT TuncateType = User->getValueType(0); >From 1c990b34374b8377b9dd223c6dd4d15f2eb1c140 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <bryce.wil...@oldmissioncapital.com> Date: Thu, 25 Jan 2024 08:57:01 -0600 Subject: [PATCH 15/16] Address comments --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index f9d4edc7a8464..028c49c0e4df6 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -554,7 +554,7 @@ namespace { bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; - bool rightShiftUncloberFlags(SDNode *N); + bool rightShiftUnclobberFlags(SDNode *N); bool tryShiftAmountMod(SDNode *N); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTERNLOG(SDNode *N); @@ -4225,7 +4225,7 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, // the high bits in a useful state. There may be other situations where this // transformation is profitable given those conditions, but currently the // transformation is only made when it likely avoids spilling flags. -bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { +bool X86DAGToDAGISel::rightShiftUnclobberFlags(SDNode *N) { EVT VT = N->getValueType(0); // Target has to have BMI2 for RORX @@ -4251,7 +4251,6 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { unsigned TruncateSize = 0; // This only works when the result is truncated. for (const SDNode *User : N->uses()) { - auto name = User->getOperationName(CurDAG); if (!User->isMachineOpcode() || User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return false; @@ -4274,25 +4273,24 @@ bool X86DAGToDAGISel::rightShiftUncloberFlags(SDNode *N) { if (!ShiftAmount || ShiftAmount->getZExtValue() > OpSize - TruncateSize) return false; - // Only make the replacement when it avoids clobbering used flags. This is a - // similar heuristic as used in the conversion to LEA, namely looking at the - // operand for an instruction that creates flags where those flags are used. - // This will have both false positives and false negatives. Ideally, both of - // these happen later on. Perhaps in copy to flags lowering or in register - // allocation. - bool MightClobberFlags = false; + // If the shift argument has non-dead EFLAGS, then this shift probably + // clobbers those flags making the transformation to RORX useful. This may + // have false negatives or positives so ideally this transformation is made + // later on. + bool ArgProducesFlags = false; SDNode *Input = N->getOperand(0).getNode(); for (auto Use : Input->uses()) { if (Use->getOpcode() == ISD::CopyToReg) { auto *RegisterNode = dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); if (RegisterNode && RegisterNode->getReg() == X86::EFLAGS) { - MightClobberFlags = true; + ArgProducesFlags = true; break; } } } - if (!MightClobberFlags) + // Don't transform if the argument to this shift has dead EFLAGS. + if (!ArgProducesFlags) return false; // Make the replacement. @@ -5327,7 +5325,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; [[fallthrough]]; case ISD::SRA: - if (rightShiftUncloberFlags(Node)) + if (rightShiftUnclobberFlags(Node)) return; [[fallthrough]]; case ISD::SHL: >From f992595d835b03b07e3641662fa068d6ab594198 Mon Sep 17 00:00:00 2001 From: Bryce Wilson <br...@brycemw.ca> Date: Sun, 28 Jan 2024 12:51:53 -0600 Subject: [PATCH 16/16] Remove unneeded getNode --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 028c49c0e4df6..8113e2ef5b49c 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4281,8 +4281,7 @@ bool X86DAGToDAGISel::rightShiftUnclobberFlags(SDNode *N) { SDNode *Input = N->getOperand(0).getNode(); for (auto Use : Input->uses()) { if (Use->getOpcode() == ISD::CopyToReg) { - auto *RegisterNode = - dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode()); + auto *RegisterNode = dyn_cast<RegisterSDNode>(Use->getOperand(1)); if (RegisterNode && RegisterNode->getReg() == X86::EFLAGS) { ArgProducesFlags = true; break; _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits