Hi, the attached patchset implements a few R600 backend optimizations for the bfgminer bitcoin mining program.
Please review. -Tom
>From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001 From: Dmitry Cherkassov <dcherkas...@gmail.com> Date: Thu, 7 Mar 2013 20:17:59 +0400 Subject: [PATCH 1/3] R600: Add 64-bit load/store support * Added R600_Reg64 class * Added T#Index#.XY registers definition * Added v2i32 register reads from parameter and global space * Added f32 and i32 elements extraction from v2f32 and v2i32 * Added v2i32 -> v2f32 conversions Signed-off-by: Dmitry Cherkassov <dcherkas...@gmail.com> Tom Stellard: - Mark vec2 operations as expand. The addition of a vec2 register class made them all legal. --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++ lib/Target/R600/AMDILISelDAGToDAG.cpp | 10 ++++- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 ++ lib/Target/R600/R600ISelLowering.cpp | 17 +++++++++ lib/Target/R600/R600InstrInfo.cpp | 19 ++++++---- lib/Target/R600/R600Instructions.td | 44 ++++++++++++++++++++++ lib/Target/R600/R600RegisterInfo.td | 16 ++++++++ test/CodeGen/R600/64bit-kernel-args.ll | 41 ++++++++++++++++++++ test/CodeGen/R600/fadd.ll | 10 +++++ test/CodeGen/R600/fdiv.ll | 37 +++++++++++++----- test/CodeGen/R600/fmul.ll | 10 +++++ test/CodeGen/R600/fp_to_sint.ll | 10 +++++ test/CodeGen/R600/fp_to_uint.ll | 10 +++++ test/CodeGen/R600/fsub.ll | 20 +++++++--- test/CodeGen/R600/setcc.ll | 18 +++++++-- test/CodeGen/R600/sint_to_fp.ll | 10 +++++ test/CodeGen/R600/udiv.ll | 20 +++++++--- test/CodeGen/R600/uint_to_fp.ll | 10 +++++ test/CodeGen/R600/urem.ll | 21 ++++++++--- 19 files changed, 292 insertions(+), 40 deletions(-) create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index a266df5..4a064b1 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::f32, Promote); AddPromotedToType(ISD::STORE, 
MVT::f32, MVT::i32); + setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index ba75a44..198cd7e 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { break; } + unsigned RegSequenceClassID; + EVT VT = N->getValueType(0); + assert(VT.isVector()); + switch (VT.getVectorNumElements()) { + case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break; + case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break; + default: llvm_unreachable("Unhandled vector width in BUILD_VECTOR"); + } // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG // that adds a 128 bits reg copy when going through TwoAddressInstructions // pass. We want to avoid 128 bits copies as much as possible because they // can't be bundled by our scheduler. 
SDValue RegSeqArgs[9] = { - CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + CurDAG->getTargetConstant(RegSequenceClassID, MVT::i32), SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32), SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32), diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 7c83d86..030fc87 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, } else { switch(MI.getOpcode()) { case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { uint64_t inst = getBinaryCodeForInstr(MI, Fixups); EmitByte(INSTR_NATIVE, OS); @@ -160,9 +161,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: case AMDGPU::VTX_READ_PARAM_32_eg: + case AMDGPU::VTX_READ_PARAM_64_eg: case AMDGPU::VTX_READ_PARAM_128_eg: case AMDGPU::VTX_READ_GLOBAL_8_eg: case AMDGPU::VTX_READ_GLOBAL_32_eg: + case AMDGPU::VTX_READ_GLOBAL_64_eg: case AMDGPU::VTX_READ_GLOBAL_128_eg: case AMDGPU::TEX_VTX_CONSTBUF: case AMDGPU::TEX_VTX_TEXBUF : { diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a66baca..b6b7c32 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -32,22 +32,38 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, 
&AMDGPU::R600_Reg64RegClass); + computeRegisterProperties(); setOperationAction(ISD::FADD, MVT::v4f32, Expand); + setOperationAction(ISD::FADD, MVT::v2f32, Expand); setOperationAction(ISD::FMUL, MVT::v4f32, Expand); + setOperationAction(ISD::FMUL, MVT::v2f32, Expand); setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + setOperationAction(ISD::FDIV, MVT::v2f32, Expand); setOperationAction(ISD::FSUB, MVT::v4f32, Expand); + setOperationAction(ISD::FSUB, MVT::v2f32, Expand); setOperationAction(ISD::ADD, MVT::v4i32, Expand); + setOperationAction(ISD::ADD, MVT::v2i32, Expand); setOperationAction(ISD::AND, MVT::v4i32, Expand); + setOperationAction(ISD::AND, MVT::v2i32, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); setOperationAction(ISD::UDIV, MVT::v4i32, Expand); + setOperationAction(ISD::UDIV, MVT::v2i32, Expand); setOperationAction(ISD::UREM, MVT::v4i32, Expand); + setOperationAction(ISD::UREM, MVT::v2i32, Expand); setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v2i32, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); @@ -158,6 +174,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 85288c9..eb09665 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -50,9 +50,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { - if (AMDGPU::R600_Reg128RegClass.contains(DestReg) - && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { - for (unsigned I = 0; I < 4; I++) { + unsigned VectorComponents = 0; + if (AMDGPU::R600_Reg128RegClass.contains(DestReg) && + AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + VectorComponents = 4; + } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) && + AMDGPU::R600_Reg64RegClass.contains(SrcReg)) { + VectorComponents = 2; + } + + if (VectorComponents > 0) { + for (unsigned I = 0; I < VectorComponents; I++) { unsigned SubRegIndex = RI.getSubRegFromChannel(I); buildDefaultInstruction(MBB, MI, AMDGPU::MOV, RI.getSubReg(DestReg, SubRegIndex), @@ -61,11 +69,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, RegState::Define | RegState::Implicit); } } else { - - // We can't copy vec4 registers - assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg) - && !AMDGPU::R600_Reg128RegClass.contains(SrcReg)); - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, DestReg, SrcReg); NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0)) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index ea8ee05..b1e8d1c 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1794,6 +1794,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] >; +// 64-bit store +def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg < + (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + 0x3, "RAT_WRITE_CACHELESS_64_eg", + [(global_store 
(v2i32 R600_Reg64:$rw_gpr), R600_TReg32_X:$index_gpr)] +>; + //128-bit store def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), @@ -1901,6 +1908,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> let Constraints = "$ptr.ptr = $dst"; } +class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_64", buffer_id, (outs R600_Reg64:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 8; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst), pattern> { @@ -1934,6 +1953,11 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] >; +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, + [(set (v2i32 R600_Reg64:$dst), (load_param ADDRVTX_READ:$ptr))] +>; + + def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] >; @@ -1952,6 +1976,12 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))] >; +// 64-bit reads +def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, + [(set (v2i32 R600_Reg64:$dst), (global_load ADDRVTX_READ:$ptr))] +>; + + // 128-bit reads def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))] @@ -2439,10 +2469,24 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>; def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>; def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>; +def : Extract_Element <f32, v2f32, R600_Reg64, 0, sub0>; +def : Extract_Element <f32, v2f32, R600_Reg64, 1, sub1>; + +def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 0, sub0>; +def : Insert_Element <f32, v2f32, R600_Reg32, 
R600_Reg64, 1, sub1>; + +def : Extract_Element <i32, v2i32, R600_Reg64, 0, sub0>; +def : Extract_Element <i32, v2i32, R600_Reg64, 1, sub1>; + +def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 0, sub0>; +def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 1, sub1>; + // bitconvert patterns def : BitConvert <i32, f32, R600_Reg32>; def : BitConvert <f32, i32, R600_Reg32>; +def : BitConvert <v2f32, v2i32, R600_Reg64>; +def : BitConvert <v2i32, v2f32, R600_Reg64>; def : BitConvert <v4f32, v4i32, R600_Reg128>; def : BitConvert <v4i32, v4f32, R600_Reg128>; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 5a2e65c..6bde923 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : let HWEncoding = encoding; } +class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; +} + + foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { // 32-bit Temporary Registers @@ -41,6 +49,11 @@ foreach Index = 0-127 in { !cast<Register>("T"#Index#"_Z"), !cast<Register>("T"#Index#"_W")], Index>; + + def T#Index#_XY : R600Reg_64 <"T"#Index#".XY", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y")], + Index>; } // KCACHE_BANK0 @@ -184,6 +197,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, let CopyCost = -1; } +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add (sequence "T%u_XY", 0, 63))>; + //===----------------------------------------------------------------------===// // Register classes for indirect addressing //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll new file mode 100644 
index 0000000..6f03b68 --- /dev/null +++ b/test/CodeGen/R600/64bit-kernel-args.ll @@ -0,0 +1,41 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @v2i32_load_extract_store +; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40 +define void @v2i32_load_extract_store(i32 addrspace(1)* nocapture %out, <2 x i32> %in) { +entry: + %0 = extractelement <2 x i32> %in, i32 0 + store i32 %0, i32 addrspace(1)* %out, align 4 + %1 = extractelement <2 x i32> %in, i32 1 + %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i32 1 + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: @v2f32_load_extract_store +; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40 +define void @v2f32_load_extract_store(float addrspace(1)* nocapture %out, <2 x float> %in) { +entry: + %0 = extractelement <2 x float> %in, i32 0 + store float %0, float addrspace(1)* %out, align 4 + %1 = extractelement <2 x float> %in, i32 1 + %arrayidx1 = getelementptr inbounds float addrspace(1)* %out, i32 1 + store float %1, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: @v2i32_load_store +; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40 +define void @v2i32_load_store(<2 x i32> addrspace(1)* %out, <2 x i32> %in) { +entry: + store <2 x i32> %in, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @v2f32_load_store +; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40 +define void @v2f32_load_store(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + store <2 x float> %in, <2 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll index 81a4fa5..1e51c35 100644 --- a/test/CodeGen/R600/fadd.ll +++ b/test/CodeGen/R600/fadd.ll @@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone declare void @llvm.AMDGPU.store.output(float, i32) +; CHECK: @fadd_v2f32 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void 
@fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fadd <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + ; CHECK: @fadd_v4f32 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll index 79e677f..240f1e5 100644 --- a/test/CodeGen/R600/fdiv.ll +++ b/test/CodeGen/R600/fdiv.ll @@ -1,15 +1,32 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; These tests check that fdiv is expanded correctly and also test that the +; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate +; instruction groups. 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; CHECK: @fdiv_v2f32 +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fdiv <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; CHECK: @fdiv_v4f32 +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float> addrspace(1) * %in %b = load <4 x float> addrspace(1) * %b_ptr diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll index 7fd22d8..74c277d 100644 --- a/test/CodeGen/R600/fmul.ll +++ b/test/CodeGen/R600/fmul.ll @@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone declare void @llvm.AMDGPU.store.output(float, i32) +; CHECK: @fmul_v2f32 +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fmul <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + ; CHECK: @fmul_v4f32 ; CHECK: 
MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll index 9c21ad2..dabfe41 100644 --- a/test/CodeGen/R600/fp_to_sint.ll +++ b/test/CodeGen/R600/fp_to_sint.ll @@ -1,5 +1,15 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @fp_to_sint_v2i32 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptosi <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + ; CHECK: @fp_to_sint_v4i32 ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll index d91098f..95c62f7 100644 --- a/test/CodeGen/R600/fp_to_uint.ll +++ b/test/CodeGen/R600/fp_to_uint.ll @@ -1,5 +1,15 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @fp_to_uint_v2i32 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptoui <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + ; CHECK: @fp_to_uint_v4i32 ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll index 812388b..f93212c 100644 --- a/test/CodeGen/R600/fsub.ll +++ b/test/CodeGen/R600/fsub.ll @@ -2,7 +2,6 @@ ; CHECK: @fsub_f32 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} - define void @fsub_f32() { %r0 = call float @llvm.R600.load.input(i32 0) %r1 = call float @llvm.R600.load.input(i32 1) @@ -15,12 +14,21 
@@ declare float @llvm.R600.load.input(i32) readnone declare void @llvm.AMDGPU.store.output(float, i32) -; CHECK: @fsub_v4f32 -; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: @fsub_v2f32 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fsub <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} +; CHECK: @fsub_v4f32 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float> addrspace(1) * %in diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll index 0752f2e..ba8fca7 100644 --- a/test/CodeGen/R600/setcc.ll +++ b/test/CodeGen/R600/setcc.ll @@ -1,7 +1,19 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +; CHECK: @setcc_v2i32 +; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = icmp eq <2 x i32> %a, %b + %sext = sext <2 x i1> %result to <2 x i32> + store <2 x i32> %sext, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @setcc_v4i32 +; CHECK: 
SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32> addrspace(1) * %in %b = load <4 x i32> addrspace(1) * %b_ptr diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll index 6a56db3..dc163da 100644 --- a/test/CodeGen/R600/sint_to_fp.ll +++ b/test/CodeGen/R600/sint_to_fp.ll @@ -1,5 +1,15 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @sint_to_fp_v2i32 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = sitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + ; CHECK: @sint_to_fp_v4i32 ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll index b81e366..0e91b2b 100644 --- a/test/CodeGen/R600/udiv.ll +++ b/test/CodeGen/R600/udiv.ll @@ -1,11 +1,19 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;The code generated by udiv is long and complex and may frequently change. -;The goal of this test is to make sure the ISel doesn't fail when it gets -;a v4i32 udiv -;CHECK: CF_END +; The code generated by udiv is long and complex and may frequently change. 
+; The goal of these tests is to make sure the ISel doesn't fail on udiv -define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +; CHECK: @udiv_v2i32 +; CHECK: CF_END +define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = udiv <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @udiv_v4i32 +; CHECK: CF_END +define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32> addrspace(1) * %in %b = load <4 x i32> addrspace(1) * %b_ptr diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll index ae8fc8e..791f117 100644 --- a/test/CodeGen/R600/uint_to_fp.ll +++ b/test/CodeGen/R600/uint_to_fp.ll @@ -1,5 +1,15 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @uint_to_fp_v2i32 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = uitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + ; CHECK: @uint_to_fp_v4i32 ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll index a2cc0bd..1a50e65 100644 --- a/test/CodeGen/R600/urem.ll +++ b/test/CodeGen/R600/urem.ll @@ -1,11 +1,20 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;The code generated by urem is long and complex and may frequently change. -;The goal of this test is to make sure the ISel doesn't fail when it gets -;a v4i32 urem -;CHECK: CF_END +; The code generated by urem is long and complex and may frequently change. 
+; The goal of these tests is to make sure the ISel doesn't fail when it gets +; a urem. -define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +; CHECK: @urem_v2i32 +; CHECK: CF_END +define void @urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = urem <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @urem_v4i32 +; CHECK: CF_END +define void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32> addrspace(1) * %in %b = load <4 x i32> addrspace(1) * %b_ptr -- 1.8.1.5
>From b4a725a06bc00ec00c8d13e207d187d0fcca2ea2 Mon Sep 17 00:00:00 2001 From: Tom Stellard <thomas.stell...@amd.com> Date: Wed, 17 Apr 2013 11:55:44 -0400 Subject: [PATCH 2/3] R600: Add pattern for SHA-256 Ma function This can be optimized using the BFI_INT instruction. --- lib/Target/R600/AMDGPUInstructions.td | 8 ++++++++ lib/Target/R600/R600Instructions.td | 3 +++ lib/Target/R600/SIInstructions.td | 6 ++++++ test/CodeGen/R600/bfi_int.ll | 18 ++++++++++++++++++ 4 files changed, 35 insertions(+) diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 4b37a53..d801b32 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -281,6 +281,14 @@ multiclass BFIPatterns <Instruction BFI_INT> { } +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + include "R600Instructions.td" include "SIInstrInfo.td" diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index b1e8d1c..1630936 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1691,6 +1691,9 @@ let hasSideEffects = 1 in { def : Pat<(fp_to_uint R600_Reg32:$src0), (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; + // SHA-256 Patterns + def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; + def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9faf89b..b6217e6 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1617,4 +1617,10 @@ def : Pat< (V_CMP_U_F32_e64 f32:$src0, f32:$src1) >; +//============================================================================// +// Miscellaneous 
Optimization Patterns +//============================================================================// + +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>; + } // End isSI predicate diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll index c9015a6..abe2591 100644 --- a/test/CodeGen/R600/bfi_int.ll +++ b/test/CodeGen/R600/bfi_int.ll @@ -32,3 +32,21 @@ entry: store i32 %2, i32 addrspace(1)* %out ret void } + +; SHA-256 Ma function +; ((x & z) | (y & (x | z))) +; R600-CHECK: @bfi_sha256_ma +; R600-CHECK: XOR_INT [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-CHECK: BFI_INT {{T[0-9]+\.[XYZW]}}, [[DST]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI-CHECK: V_XOR_B32_e32 [[DST:VGPR[0-9]+]], {{VGPR[0-9]+, VGPR[0-9]+}} +; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{VGPR[0-9]+, VGPR[0-9]+}} + +define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = and i32 %x, %z + %1 = or i32 %x, %z + %2 = and i32 %y, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} -- 1.8.1.5
>From 2bf7212e559190087101b49e7b09a18f10a85db2 Mon Sep 17 00:00:00 2001 From: Tom Stellard <thomas.stell...@amd.com> Date: Wed, 17 Apr 2013 12:10:21 -0400 Subject: [PATCH 3/3] R600: BFI_INT is a vector-only instruction --- lib/Target/R600/R600Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 1630936..78d2320 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1635,7 +1635,7 @@ let Predicates = [isEGorCayman] in { VecALU >; - def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>; + def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>; defm : BFIPatterns <BFI_INT_eg>; def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", -- 1.8.1.5
>From 05c4903985bbe60a54b43ce745d34143895cdf9a Mon Sep 17 00:00:00 2001 From: Tom Stellard <thomas.stell...@amd.com> Date: Mon, 29 Apr 2013 13:40:01 -0400 Subject: [PATCH] R600: Expand vector or, shl, srl, and xor nodes --- lib/Target/R600/R600ISelLowering.cpp | 8 ++++++++ test/CodeGen/R600/or.ll | 23 +++++++++++++++++++++++ test/CodeGen/R600/shl.ll | 23 +++++++++++++++++++++++ test/CodeGen/R600/srl.ll | 23 +++++++++++++++++++++++ test/CodeGen/R600/xor.ll | 23 +++++++++++++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 test/CodeGen/R600/or.ll create mode 100644 test/CodeGen/R600/shl.ll create mode 100644 test/CodeGen/R600/srl.ll create mode 100644 test/CodeGen/R600/xor.ll diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index b6b7c32..ce3f16f 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -54,8 +54,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); + setOperationAction(ISD::OR, MVT::v4i32, Expand); + setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); + setOperationAction(ISD::SHL, MVT::v4i32, Expand); + setOperationAction(ISD::SHL, MVT::v2i32, Expand); + setOperationAction(ISD::SRL, MVT::v4i32, Expand); + setOperationAction(ISD::SRL, MVT::v2i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); setOperationAction(ISD::UDIV, MVT::v4i32, Expand); @@ -64,6 +70,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::UREM, MVT::v2i32, Expand); setOperationAction(ISD::SETCC, MVT::v4i32, Expand); setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + 
setOperationAction(ISD::XOR, MVT::v4i32, Expand); + setOperationAction(ISD::XOR, MVT::v2i32, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll new file mode 100644 index 0000000..e102a5e --- /dev/null +++ b/test/CodeGen/R600/or.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @or_v2i32 +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = or <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @or_v4i32 +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = or <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll new file mode 100644 index 0000000..88db44d --- /dev/null +++ b/test/CodeGen/R600/shl.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @shl_v2i32 +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = shl <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @shl_v4i32 +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 
T[0-9]+\.[XYZW]}} +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = shl <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll new file mode 100644 index 0000000..ebfb9bc --- /dev/null +++ b/test/CodeGen/R600/srl.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @lshr_v2i32 +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = lshr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @lshr_v4i32 +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = lshr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll new file mode 100644 index 0000000..109019f --- /dev/null +++ b/test/CodeGen/R600/xor.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @xor_v2i32 +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void 
+} + +; CHECK: @xor_v4i32 +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} -- 1.8.1.5
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev