llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: None (vangthao95) <details> <summary>Changes</summary> --- Patch is 298.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176255.diff 5 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+8) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+785-1050) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+832-1085) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+331-220) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+331-220) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5f5344b55ac35..f1810e219c7d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -487,6 +487,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); + addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); bool hasMulHi = ST->hasScalarMulHiInsts(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 2f956d7a0a534..f6e36241a05dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_saddsat_i7: @@ -98,8 +98,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, 9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 9 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_saddsat_i7: @@ -107,8 +108,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 ; GFX10PLUS-NEXT: 
s_lshl_b32 s1, s1, 9 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result @@ -207,8 +209,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 8 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_saddsat_i8: @@ -216,8 +219,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result @@ -436,58 +440,48 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, 0x80008 +; GFX9-NEXT: s_ashr_i32 s1, s1, s2 
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_saddsat_v2i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s2, s0, 8 -; GFX10-NEXT: s_lshr_b32 s3, s1, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff -; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: s_saddsat_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s3, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_saddsat_v2i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 8 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s1 clamp +; GFX10PLUS-NEXT: s_sext_i32_i16 s1, 0x80008 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s2, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) @@ -886,66 +880,89 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX9-NEXT: s_lshl_b32 s4, s4, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_add_i16 v1, s2, v1 
clamp -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_pk_add_i16 v0, s2, v0 clamp +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, 0x80008 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_ashr_i32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s0, s0, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX9-NEXT: s_sext_i32_i16 s2, s1 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_ashr_i32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s1, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s0, 0xff +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s2, s0 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 +; GFX10-NEXT: 
s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX10-NEXT: 
s_pack_ll_b32_b16 s1, s4, s5 +; GFX10-NEXT: s_sext_i32_i16 s3, 0x80008 +; GFX10-NEXT: v_pk_add_i16 v1, s0, s1 clamp ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: s_sext_i32_i16 s2, s0 +; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_ashr_i32 s2, s2, s3 +; GFX10-NEXT: s_ashr_i32 s0, s0, 8 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_ashr_i32 s3, s4, s3 +; GFX10-NEXT: s_ashr_i32 s1, s1, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_and_b32 s3, s1, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 16 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_saddsat_v4i8: @@ -965,28 +982,40 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s5, 8 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-NEXT: s_sext_i32_i16 s3, 0x80008 ; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 
16, 8 -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: s_sext_i32_i16 s2, s0 +; GFX11-NEXT: s_ashr_i32 s0, s0, 16 +; GFX11-NEXT: s_ashr_i32 s2, s2, s3 +; GFX11-NEXT: s_ashr_i32 s0, s0, 8 +; GFX11-NEXT: s_sext_i32_i16 s4, s1 +; GFX11-NEXT: s_ashr_i32 s1, s1, 16 +; GFX11-NEXT: s_ashr_i32 s3, s4, s3 +; GFX11-NEXT: s_ashr_i32 s1, s1, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 16 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -1085,8 +1114,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 8 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_saddsat_i24: @@ -1094,8 +1123,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; 
GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -4090,9 +4119,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc -; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v5, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -4108,9 +4137,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_bfe_i32 v5, v6, 0, 16 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -4178,51 +4207,49 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i48: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s2 -; GFX6-NEXT: s_addc_u32 s3, s1, s3 +; GFX6-NEXT: s_addc_u32 s5, s1, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc +; GFX6-NEXT: s_cselect_... 
[truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/176255 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
