[Mesa-dev] R600/SI: Intrinsics for derivatives

Michel Dänzer Fri, 07 Jun 2013 08:28:05 -0700

The most important difference to the previous version of these is that
whole quad mode is now enabled and M0 initialized appropriately for the
LDS instructions, which now allows all of the relevant piglit tests to
pass.



-- 
Earthling Michel Dänzer           |                   http://www.amd.com
Libre software enthusiast         |          Debian, X and DRI developer

From db07ab94113be5810fd6d1035b3d394ed53d27ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daen...@amd.com>
Date: Thu, 21 Feb 2013 16:12:45 +0100
Subject: [PATCH 1/3] R600/SI: Add intrinsics for texture sampling with user
 derivatives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer <michel.daen...@amd.com>
---
 lib/Target/R600/SIInstructions.td | 7 ++++++-
 lib/Target/R600/SIIntrinsics.td   | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index b6db815..73f87ca 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
 def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
+def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper <0x0000002a, "IMAGE_SAMPLE_C_D">;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
 def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
 def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
@@ -1296,6 +1296,11 @@ multiclass SamplePatterns<ValueType addr_type> {
   def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
   def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
   def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
+
+  def : SamplePattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
+  def : SampleArrayPattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
+  def : SampleShadowPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
 }
 
 defm : SamplePatterns<v2i32>;
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 224cd2f..d2643e0 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -23,6 +23,7 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
   def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-- 
1.8.3

From 466936a680993dec58e1e537f3b489cd82b5176c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daen...@amd.com>
Date: Thu, 21 Feb 2013 18:51:38 +0100
Subject: [PATCH 2/3] R600/SI: Initial support for LDS/GDS instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer <michel.daen...@amd.com>
---
 lib/Target/R600/SIInstrFormats.td      | 24 ++++++++++++++++++++++++
 lib/Target/R600/SIInstrInfo.td         | 23 +++++++++++++++++++++++
 lib/Target/R600/SIInstructions.td      |  3 +++
 lib/Target/R600/SILowerControlFlow.cpp | 16 ++++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 51f323d..434aa7e 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -281,6 +281,30 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
 let Uses = [EXEC] in {
 
+class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> vdst;
+  bits<1> gds;
+  bits<8> addr;
+  bits<8> data0;
+  bits<8> data1;
+  bits<8> offset0;
+  bits<8> offset1;
+
+  let Inst{7-0} = offset0;
+  let Inst{15-8} = offset1;
+  let Inst{17} = gds;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x36; //encoding
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data0;
+  let Inst{55-48} = data1;
+  let Inst{63-56} = vdst;
+
+  let LGKM_CNT = 1;
+}
+
 class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64<outs, ins, asm, pattern> {
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 42fa95f..47a64f7 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -286,6 +286,29 @@ class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
 // Vector I/O classes
 //===----------------------------------------------------------------------===//
 
+class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+  op,
+  (outs regClass:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
+       i8imm:$offset0, i8imm:$offset1),
+  asm#" $vdst, $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+  op,
+  (outs),
+  (ins regClass:$vdata, i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
+       i8imm:$offset0, i8imm:$offset1),
+  asm#" $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 0;
+  let vdst = 0;
+}
+
 class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs),
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 73f87ca..d8fbf3e 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -391,6 +391,9 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
 
 } // End isCompare = 1
 
+def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
+def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
+
 //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
 //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
 //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 2b60eb9..900309a 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -410,6 +410,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
   bool HaveKill = false;
+  bool NeedM0 = false;
   bool NeedWQM = false;
   unsigned Depth = 0;
 
@@ -481,6 +482,13 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           IndirectDst(MI);
           break;
 
+        case AMDGPU::DS_READ_B32:
+          NeedWQM = true;
+          // Fall through
+        case AMDGPU::DS_WRITE_B32:
+          NeedM0 = true;
+          break;
+
         case AMDGPU::V_INTERP_P1_F32:
         case AMDGPU::V_INTERP_P2_F32:
         case AMDGPU::V_INTERP_MOV_F32:
@@ -491,6 +499,14 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  if (NeedM0) {
+    MachineBasicBlock &MBB = MF.front();
+    // Initialize M0 to a value that won't cause LDS access to be discarded
+    // due to offset clamping
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+            AMDGPU::M0).addImm(0xffffffff);
+  }
+
   if (NeedWQM) {
     MachineBasicBlock &MBB = MF.front();
     BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-- 
1.8.3

From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daen...@amd.com>
Date: Thu, 21 Feb 2013 17:56:22 +0100
Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use LDS for calculating the deltas between neighbouring pixels.

Signed-off-by: Michel Dänzer <michel.daen...@amd.com>
---
 lib/Target/R600/SIISelLowering.cpp | 77 +++++++++++++++++++++++++++++++++++++-
 lib/Target/R600/SIISelLowering.h   |  6 +++
 lib/Target/R600/SIInstructions.td  | 42 ++++++++++++++++++++-
 3 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index ac6a4c3..7ea226a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments(
 
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
-
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   MachineBasicBlock::iterator I = *MI;
 
   switch (MI->getOpcode()) {
@@ -257,7 +257,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
   case AMDGPU::SI_ADDR64_RSRC: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     unsigned SuperReg = MI->getOperand(0).getReg();
     unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -282,10 +281,84 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_DD:
+    LowerSI_DD(MI, *BB, I, MRI);
+    break;
+  case AMDGPU::SI_TID:
+    LowerSI_TID(MI, *BB, I, MRI);
+    break;
   }
   return BB;
 }
 
+void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
+  unsigned coord0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  unsigned coord1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  MachineOperand dst = MI->getOperand(0);
+  MachineOperand coord = MI->getOperand(1);
+  MachineOperand ldsaddr = MI->getOperand(2);
+  MachineOperand ldsaddr0 = MI->getOperand(3);
+  MachineOperand ldsdelta = MI->getOperand(4);
+
+  // Write this thread's coordinate to LDS
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_WRITE_B32))
+          .addOperand(coord)
+          .addImm(0) // LDS
+          .addOperand(ldsaddr)
+          .addOperand(coord)
+          .addOperand(coord)
+          .addImm(0)
+          .addImm(0);
+
+  // Read top right / bottom left thread's coordinate from LDS
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord0)
+          .addImm(0) // LDS
+          .addOperand(ldsaddr0)
+          .addOperand(ldsaddr0)
+          .addOperand(ldsaddr0)
+          .addOperand(ldsdelta)
+          .addImm(0);
+
+  // Read top left thread's coordinate from LDS
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord1)
+          .addImm(0) // LDS
+          .addOperand(ldsaddr0)
+          .addOperand(ldsaddr0)
+          .addOperand(ldsaddr0)
+          .addImm(0)
+          .addImm(0);
+
+  // Subtract top left coordinate from top right / bottom left
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_SUB_F32_e32))
+          .addOperand(dst)
+          .addReg(coord0)
+          .addReg(coord1);
+
+  MI->eraseFromParent();
+}
+
+void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
+  unsigned mbcnt_lo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+  MachineOperand dst = MI->getOperand(0);
+
+  // Get this thread's ID
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)
+          .addImm(0xffffffff)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0);
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e32))
+          .addOperand(dst)
+          .addImm(0xffffffff)
+          .addReg(mbcnt_lo);
+
+  MI->eraseFromParent();
+}
+
 EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   return MVT::i1;
 }
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 9b263b9..1a6538d 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -26,6 +26,12 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
   SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
                          SDValue Chain, unsigned Offset) const;
+
+  void LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void LowerSI_TID(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index d8fbf3e..1126729 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -903,8 +903,8 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
 defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
 //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
-//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
-//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
 defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
@@ -1110,6 +1110,24 @@ def LOAD_CONST : AMDGPUShaderInst <
   [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
 >;
 
+let usesCustomInserter = 1 in {
+
+def SI_DD : InstSI <
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src, VReg_32:$lds_addr, VReg_32:$lds_addr0, i8imm:$ldsdelta),
+  "SI_DD $src, $lds_addr, $lds_addr0, $ldsdelta",
+  []
+>;
+
+def SI_TID : InstSI <
+  (outs VReg_32:$dst),
+  (ins),
+  "SI_TID",
+  []
+>;
+
+} // end usesCustomInserter
+
 // SI Psuedo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
@@ -1544,6 +1562,26 @@ def : Pat <
                    sub3)
 >;
 
+class DDXY <Intrinsic name, bits<4> ldsdelta> : Pat <
+  (name v4f32:$src, imm, imm, imm),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+    (SI_DD (EXTRACT_SUBREG $src, sub0), (V_LSHLREV_B32_e32 2, (SI_TID)),
+           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
+           ldsdelta), sub0),
+    (SI_DD (EXTRACT_SUBREG $src, sub1), (V_LSHLREV_B32_e32 2, (SI_TID)),
+           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
+           ldsdelta), sub1),
+    (SI_DD (EXTRACT_SUBREG $src, sub2), (V_LSHLREV_B32_e32 2, (SI_TID)),
+           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
+           ldsdelta), sub2),
+    (SI_DD (EXTRACT_SUBREG $src, sub3), (V_LSHLREV_B32_e32 2, (SI_TID)),
+           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
+           ldsdelta), sub3)
+>;
+
+def : DDXY<int_AMDGPU_ddx, 4>;
+def : DDXY<int_AMDGPU_ddy, 8>;
+
 def : Pat <
   (i32 (sext i1:$src0)),
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
-- 
1.8.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] R600/SI: Intrinsics for derivatives

Reply via email to