Re: [Mesa-dev] [PATCH] R600/SI: expose TBUFFER_STORE_FORMAT_* for OpenGL transform feedback

2013-09-05 Thread Tom Stellard
On Mon, Sep 02, 2013 at 09:07:18PM +0200, Marek Olšák wrote:
 For _XYZ, the type of VDATA is v4i32, because v3i32 doesn't exist.
 
 The ADDR64 bit is not exposed. A simpler intrinsic that doesn't take
 a resource descriptor might be nicer.
 
 The maximum number of input SGPRs is bumped to 17.
 
 Signed-off-by: Marek Olšák <marek.ol...@amd.com>
 ---
  lib/Target/R600/AMDGPUCallingConv.td   |  3 ++-
  lib/Target/R600/AMDGPUISelLowering.cpp |  1 +
  lib/Target/R600/AMDGPUISelLowering.h   |  1 +
  lib/Target/R600/SIISelLowering.cpp | 39 
 ++
  lib/Target/R600/SIInstrInfo.td | 27 +++
  lib/Target/R600/SIInstructions.td  | 29 +
  lib/Target/R600/SIIntrinsics.td| 18 
  7 files changed, 113 insertions(+), 5 deletions(-)
 
 diff --git a/lib/Target/R600/AMDGPUCallingConv.td 
 b/lib/Target/R600/AMDGPUCallingConv.td
 index 84d3118..d26be32 100644
 --- a/lib/Target/R600/AMDGPUCallingConv.td
 +++ b/lib/Target/R600/AMDGPUCallingConv.td
 @@ -19,7 +19,8 @@ def CC_SI : CallingConv[
  
CCIfInRegCCIfType[f32, i32] , CCAssignToReg[
  SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
 -SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
 +SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
 +SGPR16

Why is this necessary?  Are we using all 16 user sgprs now?

],
  
CCIfInRegCCIfType[i64] , CCAssignToRegWithShadow
 diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
 b/lib/Target/R600/AMDGPUISelLowering.cpp
 index 1237323..30d9503 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.cpp
 +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
 @@ -718,5 +718,6 @@ const char* 
 AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SAMPLED)
NODE_NAME_CASE(SAMPLEL)
NODE_NAME_CASE(STORE_MSKOR)
 +  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
  }
 diff --git a/lib/Target/R600/AMDGPUISelLowering.h 
 b/lib/Target/R600/AMDGPUISelLowering.h
 index 75ac4c2..8a68356 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.h
 +++ b/lib/Target/R600/AMDGPUISelLowering.h
 @@ -160,6 +160,7 @@ enum {
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
 +  TBUFFER_STORE_FORMAT,
LAST_AMDGPU_ISD_NUMBER
  };
  
 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index f196059..6fa0c85 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -86,6 +86,8 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  
 +  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 +
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
  
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
 @@ -462,6 +464,43 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
 SelectionDAG DAG) const {
   Op.getOperand(3));
  }
}
 +
 +  case ISD::INTRINSIC_VOID:
 +SDValue Chain = Op.getOperand(0);
 +unsigned IntrinsicID = 
 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 +
 +switch (IntrinsicID) {
 +  case AMDGPUIntrinsic::SI_tbuffer_store: {
 +SDLoc DL(Op);
 +SDValue Ops [] = {
 +  Chain,
 +  ResourceDescriptorToi128(Op.getOperand(2), DAG),
 +  Op.getOperand(3),
 +  Op.getOperand(4),
 +  Op.getOperand(5),
 +  Op.getOperand(6),
 +  Op.getOperand(7),
 +  Op.getOperand(8),
 +  Op.getOperand(9),
 +  Op.getOperand(10),
 +  Op.getOperand(11),
 +  Op.getOperand(12),
 +  Op.getOperand(13),
 +  Op.getOperand(14)
 +};
 +EVT VT = Op.getOperand(3).getValueType();
 +
 +MachineMemOperand *MMO = MF.getMachineMemOperand(
 +MachinePointerInfo(),
 +MachineMemOperand::MOStore,
 +VT.getSizeInBits() / 8, 4);
 +return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
 +   Op->getVTList(), Ops,
 +   sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
 +  }
 +  default:
 +break;
 +}
}
return SDValue();
  }
 diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
 index ecc4718..c902feb 100644
 --- a/lib/Target/R600/SIInstrInfo.td
 +++ b/lib/Target/R600/SIInstrInfo.td
 @@ -21,6 +21,25 @@ def SIload_constant : SDNodeAMDGPUISD::LOAD_CONSTANT,
[SDNPMayLoad, SDNPMemOperand]
  ;
  
 +def SItbuffer_store : SDNodeAMDGPUISD::TBUFFER_STORE_FORMAT,
 +  SDTypeProfile0, 13,
 +[SDTCisVT0, i128,   // rsrc(SGPR)
 + SDTCisVT1, iAny,   // vdata(VGPR)
 + SDTCisVT2, i32,// num_channels(imm)
 + SDTCisVT3, i32,// vaddr(VGPR)
 + SDTCisVT4, i32,// 

Re: [Mesa-dev] [PATCH] R600/SI: expose TBUFFER_STORE_FORMAT_* for OpenGL transform feedback

2013-09-05 Thread Marek Olšák
No, we use 11 user data SGPRs for the vertex shader, but there are
also 6 additional SGPRs loaded by the hw based on the VGT state (4
streamout offsets, streamout_enable, and streamout_write_index). The 6
SGPRs can be enabled by setting SPI_SHADER_PGM_RSRC2_VS.SO_* = 1.

Marek

On Thu, Sep 5, 2013 at 5:44 PM, Tom Stellard <t...@stellard.net> wrote:
 On Mon, Sep 02, 2013 at 09:07:18PM +0200, Marek Olšák wrote:
 For _XYZ, the type of VDATA is v4i32, because v3i32 doesn't exist.

 The ADDR64 bit is not exposed. A simpler intrinsic that doesn't take
 a resource descriptor might be nicer.

 The maximum number of input SGPRs is bumped to 17.

 Signed-off-by: Marek Olšák <marek.ol...@amd.com>
 ---
  lib/Target/R600/AMDGPUCallingConv.td   |  3 ++-
  lib/Target/R600/AMDGPUISelLowering.cpp |  1 +
  lib/Target/R600/AMDGPUISelLowering.h   |  1 +
  lib/Target/R600/SIISelLowering.cpp | 39 
 ++
  lib/Target/R600/SIInstrInfo.td | 27 +++
  lib/Target/R600/SIInstructions.td  | 29 +
  lib/Target/R600/SIIntrinsics.td| 18 
  7 files changed, 113 insertions(+), 5 deletions(-)

 diff --git a/lib/Target/R600/AMDGPUCallingConv.td 
 b/lib/Target/R600/AMDGPUCallingConv.td
 index 84d3118..d26be32 100644
 --- a/lib/Target/R600/AMDGPUCallingConv.td
 +++ b/lib/Target/R600/AMDGPUCallingConv.td
 @@ -19,7 +19,8 @@ def CC_SI : CallingConv[

CCIfInRegCCIfType[f32, i32] , CCAssignToReg[
  SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
 -SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
 +SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
 +SGPR16

 Why is this necessary?  Are we using all 16 user sgprs now?

],

CCIfInRegCCIfType[i64] , CCAssignToRegWithShadow
 diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
 b/lib/Target/R600/AMDGPUISelLowering.cpp
 index 1237323..30d9503 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.cpp
 +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
 @@ -718,5 +718,6 @@ const char* 
 AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SAMPLED)
NODE_NAME_CASE(SAMPLEL)
NODE_NAME_CASE(STORE_MSKOR)
 +  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
  }
 diff --git a/lib/Target/R600/AMDGPUISelLowering.h 
 b/lib/Target/R600/AMDGPUISelLowering.h
 index 75ac4c2..8a68356 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.h
 +++ b/lib/Target/R600/AMDGPUISelLowering.h
 @@ -160,6 +160,7 @@ enum {
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
 +  TBUFFER_STORE_FORMAT,
LAST_AMDGPU_ISD_NUMBER
  };

 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index f196059..6fa0c85 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -86,6 +86,8 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

 +  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 +
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);

setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
 @@ -462,6 +464,43 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
 SelectionDAG DAG) const {
   Op.getOperand(3));
  }
}
 +
 +  case ISD::INTRINSIC_VOID:
 +SDValue Chain = Op.getOperand(0);
 +unsigned IntrinsicID = 
 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 +
 +switch (IntrinsicID) {
 +  case AMDGPUIntrinsic::SI_tbuffer_store: {
 +SDLoc DL(Op);
 +SDValue Ops [] = {
 +  Chain,
 +  ResourceDescriptorToi128(Op.getOperand(2), DAG),
 +  Op.getOperand(3),
 +  Op.getOperand(4),
 +  Op.getOperand(5),
 +  Op.getOperand(6),
 +  Op.getOperand(7),
 +  Op.getOperand(8),
 +  Op.getOperand(9),
 +  Op.getOperand(10),
 +  Op.getOperand(11),
 +  Op.getOperand(12),
 +  Op.getOperand(13),
 +  Op.getOperand(14)
 +};
 +EVT VT = Op.getOperand(3).getValueType();
 +
 +MachineMemOperand *MMO = MF.getMachineMemOperand(
 +MachinePointerInfo(),
 +MachineMemOperand::MOStore,
 +VT.getSizeInBits() / 8, 4);
 +return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
 +   Op->getVTList(), Ops,
 +   sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
 +  }
 +  default:
 +break;
 +}
}
return SDValue();
  }
 diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
 index ecc4718..c902feb 100644
 --- a/lib/Target/R600/SIInstrInfo.td
 +++ b/lib/Target/R600/SIInstrInfo.td
 @@ -21,6 +21,25 @@ def SIload_constant : SDNodeAMDGPUISD::LOAD_CONSTANT,
  

[Mesa-dev] [PATCH] R600/SI: expose TBUFFER_STORE_FORMAT_* for OpenGL transform feedback

2013-09-02 Thread Marek Olšák
For _XYZ, the type of VDATA is v4i32, because v3i32 doesn't exist.

The ADDR64 bit is not exposed. A simpler intrinsic that doesn't take
a resource descriptor might be nicer.

The maximum number of input SGPRs is bumped to 17.

Signed-off-by: Marek Olšák <marek.ol...@amd.com>
---
 lib/Target/R600/AMDGPUCallingConv.td   |  3 ++-
 lib/Target/R600/AMDGPUISelLowering.cpp |  1 +
 lib/Target/R600/AMDGPUISelLowering.h   |  1 +
 lib/Target/R600/SIISelLowering.cpp | 39 ++
 lib/Target/R600/SIInstrInfo.td | 27 +++
 lib/Target/R600/SIInstructions.td  | 29 +
 lib/Target/R600/SIIntrinsics.td| 18 
 7 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/lib/Target/R600/AMDGPUCallingConv.td 
b/lib/Target/R600/AMDGPUCallingConv.td
index 84d3118..d26be32 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -19,7 +19,8 @@ def CC_SI : CallingConv[
 
   CCIfInRegCCIfType[f32, i32] , CCAssignToReg[
 SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
-SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+SGPR16
   ],
 
   CCIfInRegCCIfType[i64] , CCAssignToRegWithShadow
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
b/lib/Target/R600/AMDGPUISelLowering.cpp
index 1237323..30d9503 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -718,5 +718,6 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SAMPLED)
   NODE_NAME_CASE(SAMPLEL)
   NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h 
b/lib/Target/R600/AMDGPUISelLowering.h
index 75ac4c2..8a68356 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -160,6 +160,7 @@ enum {
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
+  TBUFFER_STORE_FORMAT,
   LAST_AMDGPU_ISD_NUMBER
 };
 
diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index f196059..6fa0c85 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -86,6 +86,8 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
 
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
@@ -462,6 +464,43 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG DAG) const {
  Op.getOperand(3));
 }
   }
+
+  case ISD::INTRINSIC_VOID:
+SDValue Chain = Op.getOperand(0);
+unsigned IntrinsicID = 
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+switch (IntrinsicID) {
+  case AMDGPUIntrinsic::SI_tbuffer_store: {
+SDLoc DL(Op);
+SDValue Ops [] = {
+  Chain,
+  ResourceDescriptorToi128(Op.getOperand(2), DAG),
+  Op.getOperand(3),
+  Op.getOperand(4),
+  Op.getOperand(5),
+  Op.getOperand(6),
+  Op.getOperand(7),
+  Op.getOperand(8),
+  Op.getOperand(9),
+  Op.getOperand(10),
+  Op.getOperand(11),
+  Op.getOperand(12),
+  Op.getOperand(13),
+  Op.getOperand(14)
+};
+EVT VT = Op.getOperand(3).getValueType();
+
+MachineMemOperand *MMO = MF.getMachineMemOperand(
+MachinePointerInfo(),
+MachineMemOperand::MOStore,
+VT.getSizeInBits() / 8, 4);
+return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+   Op->getVTList(), Ops,
+   sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+  }
+  default:
+break;
+}
   }
   return SDValue();
 }
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index ecc4718..c902feb 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -21,6 +21,25 @@ def SIload_constant : SDNodeAMDGPUISD::LOAD_CONSTANT,
   [SDNPMayLoad, SDNPMemOperand]
 ;
 
+def SItbuffer_store : SDNodeAMDGPUISD::TBUFFER_STORE_FORMAT,
+  SDTypeProfile0, 13,
+[SDTCisVT0, i128,   // rsrc(SGPR)
+ SDTCisVT1, iAny,   // vdata(VGPR)
+ SDTCisVT2, i32,// num_channels(imm)
+ SDTCisVT3, i32,// vaddr(VGPR)
+ SDTCisVT4, i32,// soffset(SGPR)
+ SDTCisVT5, i32,// inst_offset(imm)
+ SDTCisVT6, i32,// dfmt(imm)
+ SDTCisVT7, i32,// nfmt(imm)
+ SDTCisVT8, i32,// offen(imm)
+ SDTCisVT9, i32,// idxen(imm)
+ SDTCisVT10, i32,   // glc(imm)
+ SDTCisVT11,